From 82cc97004ccdbc81188d85f682eb6dabd69b0ee0 Mon Sep 17 00:00:00 2001 From: Vladimir Sokolovsky Date: Mon, 29 Sep 2014 15:26:23 +0300 Subject: [PATCH] xeon-phi: Updates to qib driver Signed-off-by: Jubin John Signed-off-by: Vladimir Sokolovsky --- .../xeon-phi/0013-Updates-to-qib-driver.patch | 4787 +++++++++++++++++ 1 file changed, 4787 insertions(+) create mode 100644 tech-preview/xeon-phi/0013-Updates-to-qib-driver.patch diff --git a/tech-preview/xeon-phi/0013-Updates-to-qib-driver.patch b/tech-preview/xeon-phi/0013-Updates-to-qib-driver.patch new file mode 100644 index 0000000..abc3a3a --- /dev/null +++ b/tech-preview/xeon-phi/0013-Updates-to-qib-driver.patch @@ -0,0 +1,4787 @@ +From 6975d8e44fc3f04c14cf4f83e2df6f69a25546dc Mon Sep 17 00:00:00 2001 +From: Jubin John +Date: Fri, 26 Sep 2014 09:41:32 -0700 +Subject: [PATCH] Updates to qib driver + +--- + drivers/infiniband/hw/qib/Makefile | 2 +- + drivers/infiniband/hw/qib/qib.h | 172 +++++- + drivers/infiniband/hw/qib/qib_driver.c | 223 +++++++- + drivers/infiniband/hw/qib/qib_file_ops.c | 166 ++++-- + drivers/infiniband/hw/qib/qib_iba6120.c | 12 +- + drivers/infiniband/hw/qib/qib_iba7220.c | 20 +- + drivers/infiniband/hw/qib/qib_iba7322.c | 122 ++-- + drivers/infiniband/hw/qib/qib_init.c | 118 +++-- + drivers/infiniband/hw/qib/qib_knx.c | 721 +++++++++++++++++++-- + drivers/infiniband/hw/qib/qib_knx.h | 13 +- + drivers/infiniband/hw/qib/qib_knx_common.h | 126 ++++ + drivers/infiniband/hw/qib/qib_knx_sdma.h | 105 --- + drivers/infiniband/hw/qib/qib_knx_tidrcv.h | 48 -- + drivers/infiniband/hw/qib/qib_mad.c | 3 +- + drivers/infiniband/hw/qib/qib_pcie.c | 21 +- + drivers/infiniband/hw/qib/qib_qp.c | 6 +- + drivers/infiniband/hw/qib/qib_sdma.c | 11 +- + drivers/infiniband/hw/qib/qib_snoop.c | 970 ++++++++++++++++++++++++++++ + drivers/infiniband/hw/qib/qib_user_sdma.c | 296 +++++---- + drivers/infiniband/hw/qib/qib_user_sdma.h | 105 +++- + drivers/infiniband/hw/qib/qib_verbs.c | 116 ++++- + 21 files changed, 2831 insertions(+), 545 deletions(-) + create mode 100644 drivers/infiniband/hw/qib/qib_knx_common.h + delete mode 100644 drivers/infiniband/hw/qib/qib_knx_sdma.h + delete mode 100644 drivers/infiniband/hw/qib/qib_knx_tidrcv.h + create mode 100644 drivers/infiniband/hw/qib/qib_snoop.c + +diff --git a/drivers/infiniband/hw/qib/Makefile b/drivers/infiniband/hw/qib/Makefile +index ba2a49d..047d191 100644 +--- a/drivers/infiniband/hw/qib/Makefile ++++ b/drivers/infiniband/hw/qib/Makefile +@@ -6,7 +6,7 @@ ib_qib-y := qib_cq.o qib_diag.o qib_dma.o qib_driver.o qib_eeprom.o \ + qib_qp.o qib_qsfp.o qib_rc.o qib_ruc.o qib_sdma.o qib_srq.o \ + qib_sysfs.o qib_twsi.o qib_tx.o qib_uc.o qib_ud.o \ + qib_user_pages.o qib_user_sdma.o qib_verbs_mcast.o qib_iba7220.o \ +- qib_sd7220.o qib_iba7322.o qib_verbs.o ++ qib_sd7220.o qib_iba7322.o qib_snoop.o qib_verbs.o + + # 6120 has no fallback if no MSI interrupts, others can do INTx + ib_qib-$(CONFIG_PCI_MSI) += qib_iba6120.o +diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h +index ad87abd..e34b0f7 100644 +--- a/drivers/infiniband/hw/qib/qib.h ++++ b/drivers/infiniband/hw/qib/qib.h +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + + #include "qib_common.h" + #include "qib_verbs.h" +@@ -247,6 +248,10 @@ struct qib_ctxtdata { + u32 lookaside_qpn; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; ++#ifdef QIB_CONFIG_KNX ++ /* KNX Receive Context Data */ ++ struct qib_knx_ctxt *krcd; ++#endif + #ifdef CONFIG_DEBUG_FS + /* verbs stats per 
CTX */ + struct qib_opcode_stats_perctx *opstats; +@@ -546,6 +551,11 @@ struct xmit_wait { + * clarifies things a bit. Note that to conform to IB conventions, + * port-numbers are one-based. The first or only port is port1. + */ ++#define QIB_CHAR_DEVICES_PER_PORT 2 ++/* Extract packet length from LRH header */ ++#define QIB_GET_PKT_LEN(x) (((be16_to_cpu((x)->lrh[2]) & 0x7FF)) << 2) ++#define QIB_SNOOP_DEV_INDEX 0 ++#define QIB_CAPTURE_DEV_INDEX 1 + struct qib_pportdata { + struct qib_ibport ibport_data; + +@@ -656,6 +666,7 @@ struct qib_pportdata { + u8 link_speed_active; + u8 vls_supported; + u8 vls_operational; ++ u8 n_krcv_queues; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 rx_pol_inv; + +@@ -675,6 +686,22 @@ struct qib_pportdata { + struct xmit_wait cong_stats; + struct timer_list symerr_clear_timer; + ++ /* snoop/capture related fields */ ++ unsigned int mode_flag; ++ void *filter_value; ++ int (*filter_callback)(void *hdr, void *data, void *value); ++ /* lock while sending packet out */ ++ spinlock_t snoop_write_lock; ++ struct qib_aux_device { ++ struct cdev *snoop_cdev; ++ struct device *snoop_class_dev; ++ /* snooping lock */ ++ spinlock_t snoop_lock; ++ struct list_head snoop_queue; ++ wait_queue_head_t snoop_waitq; ++ struct qib_pportdata *pport; ++ } sc_device[QIB_CHAR_DEVICES_PER_PORT]; ++ + /* Synchronize access between driver writes and sysfs reads */ + spinlock_t cc_shadow_lock + ____cacheline_aligned_in_smp; +@@ -755,14 +782,14 @@ struct qib_devdata { + + /* mem-mapped base of chip regs plus offset of the SendBufAvail0 + * register +- */ ++ */ + u64 sendbufavail0; + + /* end of mem-mapped chip space excluding sendbuf and user regs */ + u64 __iomem *kregend; + /* physical address of chip for io_remap, etc. */ + resource_size_t physaddr; +- /* qib_cfgctxts pointers */ ++ /* cfgctxts pointers */ + struct qib_ctxtdata **rcd; /* Receive Context Data */ + + /* qib_pportdata, points to array of (physical) port-specific +@@ -1079,7 +1106,6 @@ struct qib_devdata { + u8 num_pports; + /* Lowest context number which can be used by user processes */ + u8 first_user_ctxt; +- u8 n_krcv_queues; + u8 qpn_mask; + u8 skip_kctxt_mask; + +@@ -1126,13 +1152,119 @@ struct qib_devdata { + int assigned_node_id; /* NUMA node closest to HCA */ + + #ifdef QIB_CONFIG_KNX +- /* peer node id of connected KNX node */ +- u16 node_id; +- struct qib_knx *knx; ++ /* number of KNx nodes using this device */ ++ u16 num_knx; + #endif ++}; + ++enum qib_mod_param_t { ++ qib_mod_param_drv, ++ qib_mod_param_unit, ++ qib_mod_param_port + }; + ++typedef int (*param_set_func_t)(struct qib_devdata *, u8, u64); ++ ++struct qib_mod_param { ++ const char *name; ++ enum qib_mod_param_t type; ++ param_set_func_t func; ++ ulong dflt; ++ struct list_head list; ++ struct list_head pport; ++}; ++ ++extern int qib_set_mod_param(const char *, struct kernel_param *); ++extern int qib_get_mod_param(char *, struct kernel_param *); ++extern u64 qib_read_mod_param(struct qib_mod_param *, u16, u8); ++extern void qib_clean_mod_param(void); ++ ++#define MAX_QIB_PARAM_LEN 128 ++/** ++ * QIB_MODPARAM_GLOBAL - define a global module parameter ++ * @N: name of the module parameter ++ * ++ * Define a global module parameter for use in multiple files. 
++ */
++#define QIB_MODPARAM_GLOBAL(N) \
++extern struct qib_mod_param qmp_##N
++/**
++ * QIB_MODPARAM_DRV - define a driver-scope module parameter
++ * @N: name of the module parameter
++ * @D: default value
++ * @P: visibility in sysfs
++ * @S: description
++ *
++ * Define a driver-scope (global to the driver instance) module
++ * parameter.
++ */
++#define QIB_MODPARAM_DRV(N, D, P, S) \
++	struct qib_mod_param qmp_##N = { \
++		.name = __stringify(N), \
++		.type = qib_mod_param_drv, \
++		.dflt = (ulong)D, \
++		.pport = { NULL, NULL } \
++	}; \
++	module_param_named(N, qmp_##N.dflt, ulong, P); \
++	MODULE_PARM_DESC(N, S " (dflt: " __stringify(D) ")")
++/**
++ * QIB_MODPARAM_UNIT - define a unit-scope module parameter
++ * @N: name of the module parameter
++ * @F: callback function for dynamic value settings
++ * @D: default value
++ * @P: visibility in sysfs
++ * @S: description
++ *
++ * Define a unit-scope module parameter. Unit-scope module
++ * parameters allow specifying individual values for each of the
++ * QIB units.
++ */
++#define QIB_MODPARAM_UNIT(N, F, D, P, S) \
++	struct qib_mod_param qmp_##N = { \
++		.name = __stringify(N), \
++		.func = ((P) & S_IWUGO ? F : NULL), \
++		.type = qib_mod_param_unit, \
++		.dflt = (ulong)D, \
++		.pport = { NULL, NULL } \
++	}; \
++	module_param_call(N, qib_set_mod_param, qib_get_mod_param, \
++			&qmp_##N, (P)); \
++	MODULE_PARM_DESC(N, S " (dflt: " __stringify(D) ")")
++/**
++ * QIB_MODPARAM_PORT - define a port-scope module parameter
++ * @N: name of the module parameter
++ * @F: callback function for dynamic value settings
++ * @D: default value
++ * @P: visibility in sysfs
++ * @S: description
++ *
++ * Define a port-scope module parameter. Port-scope module
++ * parameters allow specifying individual values for each of the
++ * ports on any of the QIB units.
++ */
++#define QIB_MODPARAM_PORT(N, F, D, P, S) \
++	struct qib_mod_param qmp_##N = { \
++		.name = __stringify(N), \
++		.func = ((P) & S_IWUGO ? F : NULL), \
++		.type = qib_mod_param_port, \
++		.dflt = (ulong)D, \
++		.pport = { NULL, NULL } \
++	}; \
++	module_param_call(N, qib_set_mod_param, qib_get_mod_param, \
++			&qmp_##N, (P)); \
++	MODULE_PARM_DESC(N, S " (dflt: " __stringify(D) ")")
++/**
++ * QIB_MODPARAM_GET - retrieve a module parameter value
++ * @N: name of the module parameter
++ * @U: unit number
++ * @P: port number
++ *
++ * Get the value for the specific unit/port. The macro will return
++ * the correct value regardless of whether a specific value for the
++ * specified unit/port is present or the default should be used.
++ */ ++#define QIB_MODPARAM_GET(N, U, P) qib_read_mod_param(&qmp_##N, U, P) ++ + /* hol_state values */ + #define QIB_HOL_UP 0 + #define QIB_HOL_INIT 1 +@@ -1165,12 +1297,14 @@ struct qib_filedata { + }; + + extern struct list_head qib_dev_list; ++extern struct list_head qib_mod_param_list; + extern spinlock_t qib_devs_lock; + extern struct qib_devdata *qib_lookup(int unit); + extern u32 qib_cpulist_count; + extern unsigned long *qib_cpulist; + + extern unsigned qib_wc_pat; ++extern unsigned int snoop_enable; + extern unsigned qib_cc_table_size; + int qib_init(struct qib_devdata *, int); + int init_chip_wc_pat(struct qib_devdata *dd, u32); +@@ -1230,6 +1364,24 @@ void qib_hol_event(unsigned long); + void qib_disable_after_error(struct qib_devdata *); + int qib_set_uevent_bits(struct qib_pportdata *, const int); + ++#define QIB_PORT_SNOOP_MODE 1U ++#define QIB_PORT_CAPTURE_MODE 2U ++ ++struct snoop_packet { ++ struct list_head list; ++ u32 total_len; ++ u8 data[]; ++}; ++ ++int qib_snoop_add(struct qib_devdata *); ++void qib_snoop_remove(struct qib_devdata *); ++int qib_snoop_rcv_queue_packet(struct qib_pportdata *, void *, ++ void *, u32); ++void qib_snoop_send_queue_packet(struct qib_pportdata *, ++ struct snoop_packet *); ++int snoop_get_header_size(struct qib_devdata *, struct qib_ib_header *, ++ void *, u32); ++ + /* for use in system calls, where we want to know device type, etc. */ + #define ctxt_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->rcd) +@@ -1367,7 +1519,7 @@ void qib_sdma_intr(struct qib_pportdata *); + void qib_user_sdma_send_desc(struct qib_pportdata *dd, + struct list_head *pktlist); + int qib_sdma_verbs_send(struct qib_pportdata *, struct qib_sge_state *, +- u32, struct qib_verbs_txreq *); ++ u32, struct qib_verbs_txreq *, struct snoop_packet *); + /* ppd->sdma_lock should be locked before calling this. */ + int qib_sdma_make_progress(struct qib_pportdata *dd); + +@@ -1505,9 +1657,9 @@ const char *qib_get_unit_name(int unit); + #endif + + /* global module parameter variables */ +-extern unsigned qib_ibmtu; +-extern ushort qib_cfgctxts; +-extern ushort qib_num_cfg_vls; ++QIB_MODPARAM_GLOBAL(ibmtu); ++QIB_MODPARAM_GLOBAL(cfgctxts); ++QIB_MODPARAM_GLOBAL(krcvqs); + extern ushort qib_mini_init; /* If set, do few (ideally 0) writes to chip */ + extern unsigned qib_n_krcv_queues; + extern unsigned qib_sdma_fetch_arb; +diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c +index 5bee08f..e5fb836 100644 +--- a/drivers/infiniband/hw/qib/qib_driver.c ++++ b/drivers/infiniband/hw/qib/qib_driver.c +@@ -43,6 +43,9 @@ + + #include "qib.h" + ++#undef pr_fmt ++#define pr_fmt(fmt) QIB_DRV_NAME " " fmt ++ + /* + * The size has to be longer than this string, so we can append + * board/chip information to it in the init code. +@@ -51,11 +54,21 @@ const char ib_qib_version[] = QIB_DRIVER_VERSION "\n"; + + DEFINE_SPINLOCK(qib_devs_lock); + LIST_HEAD(qib_dev_list); ++LIST_HEAD(qib_mod_param_list); + DEFINE_MUTEX(qib_mutex); /* general driver use */ + +-unsigned qib_ibmtu; +-module_param_named(ibmtu, qib_ibmtu, uint, S_IRUGO); +-MODULE_PARM_DESC(ibmtu, "Set max IB MTU (0=2KB, 1=256, 2=512, ... 5=4096"); ++/* Per-unit/port module parameter value structure ++ * linked to the qib_mod_param structure - one per ++ * unit/port */ ++struct qib_mod_param_pport { ++ struct list_head list; ++ u16 unit; ++ u8 port; ++ u64 value; ++}; ++ ++QIB_MODPARAM_PORT(ibmtu, NULL, 5, S_IRUGO, ++ "Set max IB MTU (0=2KB, 1=256, 2=512, ... 
5=4096"); + + unsigned qib_compat_ddr_negotiate = 1; + module_param_named(compat_ddr_negotiate, qib_compat_ddr_negotiate, uint, +@@ -90,6 +103,178 @@ const char *qib_get_unit_name(int unit) + return iname; + } + ++int qib_set_mod_param(const char *str, struct kernel_param *kp) ++{ ++ char *next = (char *)str, *tmp; ++ unsigned long val = 0, dft; ++ u32 unit = 0, port = 0; ++ struct qib_mod_param *param = ++ (struct qib_mod_param *)kp->arg; ++ struct qib_mod_param_pport *pport, *p; ++ int ret = 0; ++ ++ if (strlen(str) >= MAX_QIB_PARAM_LEN) { ++ pr_warn("parameter value too long\n"); ++ ret = -ENOSPC; ++ goto done; ++ } ++ ++ /* qib_dev_list will be empty only when the driver is initially ++ * loading. */ ++ if (list_empty(&qib_dev_list) || !param->pport.next) ++ INIT_LIST_HEAD(¶m->pport); ++ tmp = next; ++ dft = simple_strtoul(tmp, &next, 0); ++ if (next == tmp) { ++ pr_warn("invalid parameter value\n"); ++ ret = -EINVAL; ++ goto done; ++ } ++ /* clear any previously added port entries */ ++ list_for_each_entry_safe(pport, p, ¶m->pport, list) { ++ list_del(&pport->list); ++ kfree(pport); ++ } ++ if (!*next || *next == '\n' || *next == ',') ++ param->dflt = dft; ++ else if (*next && *next == ':') ++ /* no default, rewind the string */ ++ next = tmp; ++ else ++ pr_warn("invalid parameter value\n"); ++ while (*next && next[1]) { ++ if (*next == ',') ++ tmp = ++next; ++ unit = simple_strtoul(tmp, &next, 0); ++ if (param->type == qib_mod_param_port) { ++ if (next == tmp || !*next || *next != ':') { ++ pr_warn("Invalid unit:port argument at \"%s\".\n", ++ tmp); ++ while (*next && *next++ != ',') ++ ; ++ tmp = next; ++ continue; ++ } ++ tmp = ++next; ++ port = simple_strtoul(tmp, &next, 0); ++ if (!port) { ++ /* port numbers start at 1, 0 is invalid */ ++ pr_warn("Invalid argument at \"%s\". Port numbers start at 1.\n", ++ tmp); ++ while (*next && *next++ != ',') ++ ; ++ tmp = next; ++ continue; ++ } ++ } ++ if (next == tmp || *next != '=') { ++ pr_warn("Invalid %s argument at \"%s\".\n", ++ (param->type == qib_mod_param_port ? 
++ "port" : "unit"), tmp); ++ while (*next && *next++ != ',') ++ ; ++ tmp = next; ++ continue; ++ } ++ tmp = ++next; ++ val = simple_strtoul(tmp, &next, 0); ++ if (next == tmp) { ++ pr_warn("Invalid value string at \"%s\"\n", tmp); ++ while (*next && *next++ != ',') ++ ; ++ tmp = next; ++ continue; ++ } ++ pport = kzalloc(sizeof(struct qib_mod_param_pport), ++ GFP_KERNEL); ++ if (!pport) { ++ pr_err("no memory for module parameter.\n"); ++ ret = -ENOMEM; ++ goto done; ++ } ++ pport->unit = unit; ++ pport->port = port; ++ pport->value = val; ++ list_add_tail(&pport->list, ¶m->pport); ++ if (!*next || *next == '\n') ++ break; ++ tmp = ++next; ++ } ++ /* add parameter to list so it can be cleaned up */ ++ if (!param->list.next) ++ list_add(¶m->list, &qib_mod_param_list); ++ ++ if (param->func && qib_count_units(NULL, NULL)) { ++ struct qib_devdata *dd; ++ list_for_each_entry(pport, ¶m->pport, list) { ++ param_set_func_t setfunc = param->func; ++ list_for_each_entry(dd, &qib_dev_list, list) ++ if (dd->unit == pport->unit) ++ break; ++ if (!setfunc(dd, pport->port, pport->value)) ++ pr_err("Error setting module parameter %s for IB%u:%u", ++ param->name, ++ pport->unit, ++ pport->port); ++ } ++ } ++done: ++ return ret; ++} ++ ++int qib_get_mod_param(char *buffer, struct kernel_param *kp) ++{ ++ struct qib_mod_param *param = ++ (struct qib_mod_param *)kp->arg; ++ struct qib_mod_param_pport *pport; ++ char *p = buffer; ++ int s = 0; ++ ++ s = scnprintf(p, PAGE_SIZE, "%lu", param->dflt); ++ p += s; ++ ++ if (param->pport.next) ++ list_for_each_entry(pport, ¶m->pport, list) { ++ *p++ = ','; ++ if (param->type == qib_mod_param_unit) ++ s = scnprintf(p, PAGE_SIZE, "%u=%llu", ++ pport->unit, pport->value); ++ else if (param->type == qib_mod_param_port) ++ s = scnprintf(p, PAGE_SIZE, "%u:%u=%llu", ++ pport->unit, pport->port, ++ pport->value); ++ p += s; ++ } ++ return strlen(buffer); ++} ++ ++u64 qib_read_mod_param(struct qib_mod_param *param, u16 unit, u8 port) ++{ ++ struct qib_mod_param_pport *pport; ++ u64 ret = param->dflt; ++ ++ if (param->type != qib_mod_param_drv) ++ if (param->pport.next && !list_empty(¶m->pport)) ++ list_for_each_entry(pport, ¶m->pport, list) ++ if (pport->unit == unit && ++ pport->port == port) ++ ret = pport->value; ++ return ret; ++} ++ ++void qib_clean_mod_param(void) ++{ ++ struct qib_mod_param *p; ++ struct qib_mod_param_pport *pp, *pps; ++ ++ list_for_each_entry(p, &qib_mod_param_list, list) { ++ list_for_each_entry_safe(pp, pps, &p->pport, list) { ++ list_del(&pp->list); ++ kfree(pp); ++ } ++ } ++} ++ + /* + * Return count of units with at least one port ACTIVE. 
+ */ +@@ -456,6 +641,8 @@ u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) + int last; + u64 lval; + struct qib_qp *qp, *nqp; ++ struct snoop_packet *packet = NULL; ++ u32 hdr_len = 0; + + l = rcd->head; + rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset; +@@ -478,6 +665,25 @@ u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) + /* total length */ + tlen = qib_hdrget_length_in_bytes(rhf_addr); + ebuf = NULL; ++ /* applicable only for capture */ ++ if (unlikely(ppd->mode_flag & QIB_PORT_CAPTURE_MODE)) { ++ int nomatch = 0; ++ /* We want to filter packet before copying it */ ++ if (ppd->filter_callback) ++ nomatch = ppd->filter_callback(hdr, ebuf, ++ ppd->filter_value); ++ if (nomatch == 0) { ++ packet = kzalloc(sizeof(*packet) + tlen, ++ GFP_ATOMIC); ++ if (packet) { ++ /* copy header first */ ++ packet->total_len = tlen; ++ INIT_LIST_HEAD(&packet->list); ++ hdr_len = (u8 *)rhf_addr - (u8 *)hdr; ++ memcpy(packet->data, hdr, hdr_len); ++ } ++ } ++ } + if ((dd->flags & QIB_NODMA_RTAIL) ? + qib_hdrget_use_egr_buf(rhf_addr) : + (etype != RCVHQ_RCV_TYPE_EXPECTED)) { +@@ -512,6 +718,10 @@ u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) + crcs += qib_rcv_hdrerr(rcd, ppd, rcd->ctxt, eflags, l, + etail, rhf_addr, hdr); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { ++ /* copy packet data */ ++ if (ebuf && packet) ++ memcpy((packet->data + hdr_len), ebuf, ++ (tlen - hdr_len)); + qib_ib_rcv(rcd, hdr, ebuf, tlen); + if (crcs) + crcs--; +@@ -519,6 +729,10 @@ u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) + --*llic; + } + move_along: ++ if (packet) { ++ qib_snoop_send_queue_packet(ppd, packet); ++ packet = NULL; ++ } + l += rsize; + if (l >= maxcnt) + l = 0; +@@ -619,7 +833,8 @@ int qib_set_mtu(struct qib_pportdata *ppd, u16 arg) + ret = -EINVAL; + goto bail; + } +- chk = ib_mtu_enum_to_int(qib_ibmtu); ++ chk = ib_mtu_enum_to_int( ++ QIB_MODPARAM_GET(ibmtu, ppd->dd->unit, ppd->port)); + if (chk > 0 && arg > chk) { + ret = -EINVAL; + goto bail; +diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c +index 6eebad0..376961d 100644 +--- a/drivers/infiniband/hw/qib/qib_file_ops.c ++++ b/drivers/infiniband/hw/qib/qib_file_ops.c +@@ -95,6 +95,9 @@ static ssize_t qib_aio_write(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + static unsigned int qib_poll(struct file *, struct poll_table_struct *); + static int qib_mmapf(struct file *, struct vm_area_struct *); ++static int subctxt_search_ctxts(struct qib_devdata *, struct file *, ++ const struct qib_user_info *); ++ + + static const struct file_operations qib_file_ops = { + .owner = THIS_MODULE, +@@ -1547,6 +1550,14 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt, + + rcd = qib_create_ctxtdata(ppd, ctxt, numa_id); + ++#ifdef QIB_CONFIG_KNX ++ if (uinfo->spu_knx_node_id) ++ /* ++ * Skip allocation of page pointer list for TID ++ * receives. This will be done on the KNX. 
++ */ ++ goto no_page_list; ++#endif + /* + * Allocate memory for use in qib_tid_update() at open to + * reduce cost of expected send setup per message segment +@@ -1562,6 +1573,9 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt, + ret = -ENOMEM; + goto bailerr; + } ++#ifdef QIB_CONFIG_KNX ++no_page_list: ++#endif + rcd->userversion = uinfo->spu_userversion; + + ret = init_subctxts(dd, rcd, uinfo); +@@ -1720,52 +1734,66 @@ done: + static int find_shared_ctxt(struct file *fp, + const struct qib_user_info *uinfo) + { +- int devmax, ndev, i; ++ int devmax, ndev; + int ret = 0; ++ struct qib_devdata *dd; + ++#ifdef QIB_CONFIG_KNX ++ /* ++ * In the case we are allocating a context for a KNX process, ++ * Don't loop over all devices but use the one assosiated with the ++ * requesting KNX. ++ */ ++ if (uinfo->spu_knx_node_id) { ++ dd = qib_knx_node_to_dd(uinfo->spu_knx_node_id); ++ if (dd && dd->num_knx) ++ ret = subctxt_search_ctxts(dd, fp, uinfo); ++ goto done; ++ } ++#endif + devmax = qib_count_units(NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { +- struct qib_devdata *dd = qib_lookup(ndev); +-#ifdef QIB_CONFIG_KNX +- /* +- * In the case we are allocating a context for a KNX process, +- * reject any device that is not associated with the +- * requesting KNX. +- */ +- if ((uinfo->spu_knx_node_id && +- dd->node_id != uinfo->spu_knx_node_id)) +- continue; +-#endif ++ dd = qib_lookup(ndev); + /* device portion of usable() */ + if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase)) + continue; +- for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { +- struct qib_ctxtdata *rcd = dd->rcd[i]; ++ ret = subctxt_search_ctxts(dd, fp, uinfo); ++ if (ret) ++ break; ++ } ++done: ++ return ret; ++} + +- /* Skip ctxts which are not yet open */ +- if (!rcd || !rcd->cnt) +- continue; +- /* Skip ctxt if it doesn't match the requested one */ +- if (rcd->subctxt_id != uinfo->spu_subctxt_id) +- continue; +- /* Verify the sharing process matches the master */ +- if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt || +- rcd->userversion != uinfo->spu_userversion || +- rcd->cnt >= rcd->subctxt_cnt) { +- ret = -EINVAL; +- goto done; +- } +- ctxt_fp(fp) = rcd; +- subctxt_fp(fp) = rcd->cnt++; +- rcd->subpid[subctxt_fp(fp)] = current->pid; +- tidcursor_fp(fp) = 0; +- rcd->active_slaves |= 1 << subctxt_fp(fp); +- ret = 1; ++static int subctxt_search_ctxts(struct qib_devdata *dd, struct file *fp, ++ const struct qib_user_info *uinfo) ++{ ++ int ret = 0, i; ++ for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { ++ struct qib_ctxtdata *rcd = dd->rcd[i]; ++ ++ /* Skip ctxts which are not yet open */ ++ if (!rcd || !rcd->cnt) ++ continue; ++ /* Skip ctxt if it doesn't match the requested one */ ++ if (rcd->subctxt_id != uinfo->spu_subctxt_id) ++ continue; ++ /* Verify the sharing process matches the master */ ++ if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt || ++ rcd->userversion != uinfo->spu_userversion || ++ rcd->cnt >= rcd->subctxt_cnt) { ++ ret = -EINVAL; + goto done; + } ++ ctxt_fp(fp) = rcd; ++ subctxt_fp(fp) = rcd->cnt++; ++ rcd->subpid[subctxt_fp(fp)] = current->pid; ++ tidcursor_fp(fp) = 0; ++ rcd->active_slaves |= 1 << subctxt_fp(fp); ++ ret = 1; ++ break; + } +- + done: + return ret; + } +@@ -1856,6 +1884,10 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) + + if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT) + alg = uinfo->spu_port_alg; ++ if (swminor <= 11) { ++ qib_pio_avail_bits = 1; ++ qib_rcvhdrpoll = 1; ++ } + + #ifdef QIB_CONFIG_KNX + /* 
Make sure we have a connection to the KNX module on the right node */ +@@ -1871,13 +1903,38 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) + uinfo->spu_subctxt_cnt) { + ret = find_shared_ctxt(fp, uinfo); + if (ret > 0) { +- ret = do_qib_user_sdma_queue_create(fp); ++#ifdef QIB_CONFIG_KNX ++ if (uinfo->spu_knx_node_id) { ++ ret = qib_knx_sdma_queue_create(fp); ++ } else ++#endif ++ ret = do_qib_user_sdma_queue_create(fp); + if (!ret) + assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd); + goto done_ok; + } + } + ++#ifdef QIB_CONFIG_KNX ++ /* ++ * If there is a KNX node set, we pick the device that is ++ * associate with that KNX node ++ */ ++ if (uinfo->spu_knx_node_id) { ++ struct qib_devdata *dd = ++ qib_knx_node_to_dd(uinfo->spu_knx_node_id); ++ if (dd) { ++ ret = find_free_ctxt(dd->unit, fp, uinfo); ++ if (!ret) ++ ret = qib_knx_alloc_ctxt( ++ uinfo->spu_knx_node_id, ++ ctxt_fp(fp)->ctxt); ++ } else ++ ret = -ENXIO; ++ goto done_chk_sdma; ++ } ++ ++#endif + i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE; + if (i_minor) + ret = find_free_ctxt(i_minor - 1, fp, uinfo); +@@ -1886,25 +1943,6 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) + const unsigned int cpu = cpumask_first(¤t->cpus_allowed); + const unsigned int weight = + cpumask_weight(¤t->cpus_allowed); +-#ifdef QIB_CONFIG_KNX +- /* +- * If there is a KNX node set, we pick the device that is on +- * the same NUMA node as the KNX. +- */ +- if (uinfo->spu_knx_node_id) { +- struct qib_devdata *dd = +- qib_knx_node_to_dd(uinfo->spu_knx_node_id); +- if (dd) { +- ret = find_free_ctxt(dd->unit, fp, uinfo); +- if (!ret) +- ret = qib_knx_alloc_ctxt(dd, +- ctxt_fp(fp)->ctxt); +- } else +- ret = -ENXIO; +- goto done_chk_sdma; +- } +-#endif +- + if (weight == 1 && !test_bit(cpu, qib_cpulist)) + if (!find_hca(cpu, &unit) && unit >= 0) + if (!find_free_ctxt(unit, fp, uinfo)) { +@@ -1915,8 +1953,17 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) + } + + done_chk_sdma: +- if (!ret) ++ if (!ret) { ++#ifdef QIB_CONFIG_KNX ++ if (uinfo->spu_knx_node_id) { ++ ret = qib_knx_sdma_queue_create(fp); ++ /*if (!ret) ++ ret = qib_knx_setup_tidrcv(fp);*/ ++ goto done_ok; ++ } ++#endif + ret = do_qib_user_sdma_queue_create(fp); ++ } + done_ok: + #ifdef QIB_CONFIG_KNX + knx_node_fp(fp) = uinfo->spu_knx_node_id; +@@ -2145,6 +2192,13 @@ static int qib_close(struct inode *in, struct file *fp) + + /* drain user sdma queue */ + if (fd->pq) { ++#ifdef QIB_CONFIG_KNX ++ /* ++ * The thread should be stopped first before attempting ++ * to clean the queue. 
++ */ ++ qib_knx_sdma_queue_destroy(fd); ++#endif + qib_user_sdma_queue_drain(rcd->ppd, fd->pq); + qib_user_sdma_queue_destroy(fd->pq); + } +@@ -2737,4 +2791,6 @@ void qib_device_remove(struct qib_devdata *dd) + { + qib_user_remove(dd); + qib_diag_remove(dd); ++ if (snoop_enable) ++ qib_snoop_remove(dd); + } +diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c +index 84e593d..9ab46ed 100644 +--- a/drivers/infiniband/hw/qib/qib_iba6120.c ++++ b/drivers/infiniband/hw/qib/qib_iba6120.c +@@ -2070,15 +2070,16 @@ qib_6120_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) + + static void qib_6120_config_ctxts(struct qib_devdata *dd) + { ++ u32 nkrcvqs = QIB_MODPARAM_GET(krcvqs, dd->unit, 0); + dd->ctxtcnt = qib_read_kreg32(dd, kr_portcnt); +- if (qib_n_krcv_queues > 1) { +- dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports; ++ if (nkrcvqs > 1) { ++ dd->first_user_ctxt = nkrcvqs * dd->num_pports; + if (dd->first_user_ctxt > dd->ctxtcnt) + dd->first_user_ctxt = dd->ctxtcnt; + dd->qpn_mask = dd->first_user_ctxt <= 2 ? 2 : 6; + } else + dd->first_user_ctxt = dd->num_pports; +- dd->n_krcv_queues = dd->first_user_ctxt; ++ dd->pport[0].n_krcv_queues = dd->first_user_ctxt; + } + + static void qib_update_6120_usrhead(struct qib_ctxtdata *rcd, u64 hd, +@@ -3133,7 +3134,7 @@ static void get_6120_chip_params(struct qib_devdata *dd) + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + +- mtu = ib_mtu_enum_to_int(qib_ibmtu); ++ mtu = ib_mtu_enum_to_int(QIB_MODPARAM_GET(ibmtu, dd->unit, 1)); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport->ibmtu = (u32)mtu; +@@ -3282,7 +3283,7 @@ static int init_6120_variables(struct qib_devdata *dd) + dd->rhf_offset = 0; + + /* we always allocate at least 2048 bytes for eager buffers */ +- ret = ib_mtu_enum_to_int(qib_ibmtu); ++ ret = ib_mtu_enum_to_int(QIB_MODPARAM_GET(ibmtu, dd->unit, 1)); + dd->rcvegrbufsize = ret != -1 ? 
max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); +@@ -3322,7 +3323,6 @@ static int init_6120_variables(struct qib_devdata *dd) + if (qib_mini_init) + goto bail; + +- qib_num_cfg_vls = 1; /* if any 6120's, only one VL */ + + ret = qib_create_ctxts(dd); + init_6120_cntrnames(dd); +diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c +index 454c2e7..19ad170 100644 +--- a/drivers/infiniband/hw/qib/qib_iba7220.c ++++ b/drivers/infiniband/hw/qib/qib_iba7220.c +@@ -2299,19 +2299,21 @@ static void qib_7220_config_ctxts(struct qib_devdata *dd) + { + unsigned long flags; + u32 nchipctxts; ++ u32 cfgctxts = QIB_MODPARAM_GET(cfgctxts, dd->unit, 0); ++ u32 nkrcvqs = QIB_MODPARAM_GET(krcvqs, dd->unit, 0); + + nchipctxts = qib_read_kreg32(dd, kr_portcnt); + dd->cspec->numctxts = nchipctxts; +- if (qib_n_krcv_queues > 1) { ++ if (nkrcvqs > 1) { + dd->qpn_mask = 0x3e; +- dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports; ++ dd->first_user_ctxt = nkrcvqs * dd->num_pports; + if (dd->first_user_ctxt > nchipctxts) + dd->first_user_ctxt = nchipctxts; + } else + dd->first_user_ctxt = dd->num_pports; +- dd->n_krcv_queues = dd->first_user_ctxt; ++ dd->pport[0].n_krcv_queues = dd->first_user_ctxt; + +- if (!qib_cfgctxts) { ++ if (!cfgctxts) { + int nctxts = dd->first_user_ctxt + num_online_cpus(); + + if (nctxts <= 5) +@@ -2320,8 +2322,8 @@ static void qib_7220_config_ctxts(struct qib_devdata *dd) + dd->ctxtcnt = 9; + else if (nctxts <= nchipctxts) + dd->ctxtcnt = nchipctxts; +- } else if (qib_cfgctxts <= nchipctxts) +- dd->ctxtcnt = qib_cfgctxts; ++ } else if (cfgctxts <= nchipctxts) ++ dd->ctxtcnt = cfgctxts; + if (!dd->ctxtcnt) /* none of the above, set to max */ + dd->ctxtcnt = nchipctxts; + +@@ -3846,7 +3848,7 @@ static void get_7220_chip_params(struct qib_devdata *dd) + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + +- mtu = ib_mtu_enum_to_int(qib_ibmtu); ++ mtu = ib_mtu_enum_to_int(QIB_MODPARAM_GET(ibmtu, dd->unit, 1)); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport->ibmtu = (u32)mtu; +@@ -4084,15 +4086,13 @@ static int qib_init_7220_variables(struct qib_devdata *dd) + ppd->cpspec->chase_timer.function = reenable_7220_chase; + ppd->cpspec->chase_timer.data = (unsigned long)ppd; + +- qib_num_cfg_vls = 1; /* if any 7220's, only one VL */ +- + dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = + dd->rcvhdrentsize - sizeof(u64) / sizeof(u32); + + /* we always allocate at least 2048 bytes for eager buffers */ +- ret = ib_mtu_enum_to_int(qib_ibmtu); ++ ret = ib_mtu_enum_to_int(QIB_MODPARAM_GET(ibmtu, dd->unit, 1)); + dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); +diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c +index 016e742..35fc492 100644 +--- a/drivers/infiniband/hw/qib/qib_iba7322.c ++++ b/drivers/infiniband/hw/qib/qib_iba7322.c +@@ -107,9 +107,8 @@ static const unsigned sdma_idle_cnt = 64; + * Number of VLs we are configured to use (to allow for more + * credits per vl, etc.) 
+ */ +-ushort qib_num_cfg_vls = 2; +-module_param_named(num_vls, qib_num_cfg_vls, ushort, S_IRUGO); +-MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)"); ++static QIB_MODPARAM_PORT(num_vls, NULL, 2, S_IRUGO, ++ "Set number of Virtual Lanes to use (1-8)"); + + static ushort qib_chase = 1; + module_param_named(chase, qib_chase, ushort, S_IRUGO); +@@ -120,9 +119,8 @@ module_param_named(long_attenuation, qib_long_atten, ushort, S_IRUGO); + MODULE_PARM_DESC(long_attenuation, \ + "attenuation cutoff (dB) for long copper cable setup"); + +-static ushort qib_singleport; +-module_param_named(singleport, qib_singleport, ushort, S_IRUGO); +-MODULE_PARM_DESC(singleport, "Use only IB port 1; more per-port buffer space"); ++static QIB_MODPARAM_UNIT(singleport, NULL, 0, S_IRUGO, ++ "Use only IB port 1; more per-port buffer space"); + + static ushort qib_krcvq01_no_msi; + module_param_named(krcvq01_no_msi, qib_krcvq01_no_msi, ushort, S_IRUGO); +@@ -2395,6 +2393,11 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + ++ /* ensure previous Tx parameters are not still forced */ ++ qib_write_kreg_port(ppd, krp_tx_deemph_override, ++ SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, ++ reset_tx_deemphasis_override)); ++ + if (qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, +@@ -3515,7 +3518,8 @@ try_intx: + snprintf(dd->cspec->msix_entries[msixnum].name, + sizeof(dd->cspec->msix_entries[msixnum].name) + - 1, +- QIB_DRV_NAME "%d (kctx)", dd->unit); ++ QIB_DRV_NAME "%d:%d (kctx)", dd->unit, ++ ((struct qib_ctxtdata *)arg)->ppd->port); + } + ret = request_irq( + dd->cspec->msix_entries[msixnum].msix.vector, +@@ -3651,10 +3655,10 @@ static unsigned qib_7322_boardname(struct qib_devdata *dd) + dd->majrev, dd->minrev, + (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); + +- if (qib_singleport && (features >> PORT_SPD_CAP_SHIFT) & PORT_SPD_CAP) { +- qib_devinfo(dd->pcidev, +- "IB%u: Forced to single port mode by module parameter\n", +- dd->unit); ++ if (QIB_MODPARAM_GET(singleport, dd->unit, 0) && ++ (features >> PORT_SPD_CAP_SHIFT) & PORT_SPD_CAP) { ++ qib_devinfo(dd->pcidev, "IB%u: Forced to single port mode" ++ " by module param\n", dd->unit); + features &= PORT_SPD_CAP; + } + +@@ -3941,22 +3945,30 @@ qib_7322_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) + static void qib_7322_config_ctxts(struct qib_devdata *dd) + { + unsigned long flags; +- u32 nchipctxts; ++ u32 nchipctxts, nkrcvqs; ++ u32 cfgctxts = QIB_MODPARAM_GET(cfgctxts, dd->unit, 0); ++ u8 pidx; + + nchipctxts = qib_read_kreg32(dd, kr_contextcnt); + dd->cspec->numctxts = nchipctxts; +- if (qib_n_krcv_queues > 1 && dd->num_pports) { +- dd->first_user_ctxt = NUM_IB_PORTS + +- (qib_n_krcv_queues - 1) * dd->num_pports; +- if (dd->first_user_ctxt > nchipctxts) +- dd->first_user_ctxt = nchipctxts; +- dd->n_krcv_queues = dd->first_user_ctxt / dd->num_pports; +- } else { +- dd->first_user_ctxt = NUM_IB_PORTS; +- dd->n_krcv_queues = 1; ++ dd->first_user_ctxt = NUM_IB_PORTS; ++ ++ for (pidx = 0; pidx < dd->num_pports; pidx++) { ++ nkrcvqs = QIB_MODPARAM_GET(krcvqs, dd->unit, pidx+1); ++ if (nkrcvqs > 1) { ++ if (nkrcvqs - 1 > nchipctxts - dd->first_user_ctxt) ++ dd->pport[pidx].n_krcv_queues = ++ (nchipctxts - dd->first_user_ctxt) + 1; ++ else ++ dd->pport[pidx].n_krcv_queues = nkrcvqs; ++ dd->first_user_ctxt += ++ dd->pport[pidx].n_krcv_queues - 1; ++ } 
else ++ /* Account for the HW ctxt */ ++ dd->pport[pidx].n_krcv_queues = 1; + } + +- if (!qib_cfgctxts) { ++ if (!cfgctxts) { + int nctxts = dd->first_user_ctxt + num_online_cpus(); + + if (nctxts <= 6) +@@ -3965,10 +3977,10 @@ static void qib_7322_config_ctxts(struct qib_devdata *dd) + dd->ctxtcnt = 10; + else if (nctxts <= nchipctxts) + dd->ctxtcnt = nchipctxts; +- } else if (qib_cfgctxts < dd->num_pports) ++ } else if (cfgctxts < dd->num_pports) + dd->ctxtcnt = dd->num_pports; +- else if (qib_cfgctxts <= nchipctxts) +- dd->ctxtcnt = qib_cfgctxts; ++ else if (cfgctxts <= nchipctxts) ++ dd->ctxtcnt = cfgctxts; + if (!dd->ctxtcnt) /* none of the above, set to max */ + dd->ctxtcnt = nchipctxts; + +@@ -5799,7 +5811,6 @@ static void get_7322_chip_params(struct qib_devdata *dd) + { + u64 val; + u32 piobufs; +- int mtu; + + dd->palign = qib_read_kreg32(dd, kr_pagealign); + +@@ -5818,11 +5829,10 @@ static void get_7322_chip_params(struct qib_devdata *dd) + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + +- mtu = ib_mtu_enum_to_int(qib_ibmtu); +- if (mtu == -1) +- mtu = QIB_DEFAULT_MTU; +- dd->pport[0].ibmtu = (u32)mtu; +- dd->pport[1].ibmtu = (u32)mtu; ++ dd->pport[0].ibmtu = ib_mtu_enum_to_int( ++ QIB_MODPARAM_GET(ibmtu, dd->unit, 1)); ++ dd->pport[1].ibmtu = ib_mtu_enum_to_int( ++ QIB_MODPARAM_GET(ibmtu, dd->unit, 2)); + + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) +@@ -6342,11 +6352,11 @@ static void write_7322_initregs(struct qib_devdata *dd) + qib_write_kreg(dd, KREG_IDX(RcvQPMulticastContext_1), 1); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { +- unsigned n, regno; ++ unsigned i, n, regno, ctxts[18]; + unsigned long flags; + +- if (dd->n_krcv_queues < 2 || +- !dd->pport[pidx].link_speed_supported) ++ if (dd->pport[pidx].n_krcv_queues == 1 || ++ !dd->pport[pidx].link_speed_supported) + continue; + + ppd = &dd->pport[pidx]; +@@ -6359,19 +6369,18 @@ static void write_7322_initregs(struct qib_devdata *dd) + /* Initialize QP to context mapping */ + regno = krp_rcvqpmaptable; + val = 0; +- if (dd->num_pports > 1) +- n = dd->first_user_ctxt / dd->num_pports; +- else +- n = dd->first_user_ctxt - 1; ++ for (i = 0, n = 0; n < dd->first_user_ctxt; n++) { ++ if (dd->skip_kctxt_mask & (1 << n)) ++ continue; ++ if (dd->rcd[n]->ppd->port == pidx+1) ++ ctxts[i++] = n; ++ if (i == ppd->n_krcv_queues) ++ break; ++ } + for (i = 0; i < 32; ) { + unsigned ctxt; + +- if (dd->num_pports > 1) +- ctxt = (i % n) * dd->num_pports + pidx; +- else if (i % n) +- ctxt = (i % n) + 1; +- else +- ctxt = ppd->hw_pidx; ++ ctxt = ctxts[i % ppd->n_krcv_queues]; + val |= ctxt << (5 * (i % 6)); + i++; + if (i % 6 == 0) { +@@ -6419,7 +6428,7 @@ static int qib_init_7322_variables(struct qib_devdata *dd) + { + struct qib_pportdata *ppd; + unsigned features, pidx, sbufcnt; +- int ret, mtu; ++ int ret, maxmtu = 0; + u32 sbufs, updthresh; + + /* pport structs are contiguous, allocated after devdata */ +@@ -6496,10 +6505,6 @@ static int qib_init_7322_variables(struct qib_devdata *dd) + */ + qib_7322_set_baseaddrs(dd); + +- mtu = ib_mtu_enum_to_int(qib_ibmtu); +- if (mtu == -1) +- mtu = QIB_DEFAULT_MTU; +- + dd->cspec->int_enable_mask = QIB_I_BITSEXTANT; + /* all hwerrors become interrupts, unless special purposed */ + dd->cspec->hwerrmask = ~0ULL; +@@ -6509,9 +6514,14 @@ static int qib_init_7322_variables(struct qib_devdata *dd) + ~(SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_0) | + SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_1) | + HWE_MASK(LATriggered)); +- + for (pidx = 
0; pidx < NUM_IB_PORTS; ++pidx) { + struct qib_chippport_specific *cp = ppd->cpspec; ++ int mtu = ib_mtu_enum_to_int( ++ QIB_MODPARAM_GET(ibmtu, dd->unit, pidx+1)); ++ u8 vls = QIB_MODPARAM_GET(num_vls, dd->unit, pidx+1); ++ if (mtu == -1) ++ mtu = QIB_DEFAULT_MTU; ++ maxmtu = max(maxmtu, mtu); + ppd->link_speed_supported = features & PORT_SPD_CAP; + features >>= PORT_SPD_CAP_SHIFT; + if (!ppd->link_speed_supported) { +@@ -6565,7 +6575,7 @@ static int qib_init_7322_variables(struct qib_devdata *dd) + ppd->link_width_active = IB_WIDTH_4X; + ppd->link_speed_active = QIB_IB_SDR; + ppd->delay_mult = ib_rate_to_delay[IB_RATE_10_GBPS]; +- switch (qib_num_cfg_vls) { ++ switch (vls) { + case 1: + ppd->vls_supported = IB_VL_VL0; + break; +@@ -6575,8 +6585,7 @@ static int qib_init_7322_variables(struct qib_devdata *dd) + default: + qib_devinfo(dd->pcidev, + "Invalid num_vls %u, using 4 VLs\n", +- qib_num_cfg_vls); +- qib_num_cfg_vls = 4; ++ vls); + /* fall through */ + case 4: + ppd->vls_supported = IB_VL_VL0_3; +@@ -6588,9 +6597,8 @@ static int qib_init_7322_variables(struct qib_devdata *dd) + qib_devinfo(dd->pcidev, + "Invalid num_vls %u for MTU %d " + ", using 4 VLs\n", +- qib_num_cfg_vls, mtu); ++ vls, mtu); + ppd->vls_supported = IB_VL_VL0_3; +- qib_num_cfg_vls = 4; + } + break; + } +@@ -6640,7 +6648,7 @@ static int qib_init_7322_variables(struct qib_devdata *dd) + dd->rhf_offset = dd->rcvhdrentsize - sizeof(u64) / sizeof(u32); + + /* we always allocate at least 2048 bytes for eager buffers */ +- dd->rcvegrbufsize = max(mtu, 2048); ++ dd->rcvegrbufsize = max(maxmtu, 2048); + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + +@@ -6698,8 +6706,8 @@ static int qib_init_7322_variables(struct qib_devdata *dd) + goto bail; /* no error, so can still figure out why err */ + } + +- write_7322_initregs(dd); + ret = qib_create_ctxts(dd); ++ write_7322_initregs(dd); + init_7322_cntrnames(dd); + + updthresh = 8U; /* update threshold */ +diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c +index 84b3222..0e83ed4 100644 +--- a/drivers/infiniband/hw/qib/qib_init.c ++++ b/drivers/infiniband/hw/qib/qib_init.c +@@ -67,6 +67,11 @@ + #define QLOGIC_IB_R_SOFTWARE_SHIFT 24 + #define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62) + ++unsigned int snoop_enable; /* By default (0) snooping is disabled */ ++ ++module_param_named(snoop_enable, snoop_enable , int, 0644); ++MODULE_PARM_DESC(snoop_enable, "snooping mode "); ++ + /* + * Select the NUMA node id on which to allocate the receive header + * queue, eager buffers and send pioavail register. +@@ -79,9 +84,8 @@ MODULE_PARM_DESC(numa_node, "NUMA node on which memory is allocated"); + * Number of ctxts we are configured to use (to allow for more pio + * buffers per ctxt, etc.) Zero means use chip value. 
+ */ +-ushort qib_cfgctxts; +-module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO); +-MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use"); ++QIB_MODPARAM_UNIT(cfgctxts, NULL, 0, S_IRUGO, ++ "Set max number of contexts to use"); + + /* + * If set, do not write to any regs if avoidable, hack to allow +@@ -97,9 +101,8 @@ MODULE_PARM_DESC(numa_aware, "Use NUMA aware allocations: " + "0=disabled, 1=enabled, " + "10=option 0 for AMD & <= Intel Westmere cpus and option 1 for newer cpus(default)"); + +-unsigned qib_n_krcv_queues; +-module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO); +-MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port"); ++QIB_MODPARAM_PORT(krcvqs, NULL, 0, S_IRUGO, ++ "number of kernel receive queues per IB port"); + + unsigned qib_cc_table_size; + module_param_named(cc_table_size, qib_cc_table_size, uint, S_IRUGO); +@@ -123,14 +126,15 @@ unsigned long *qib_cpulist; + /* set number of contexts we'll actually use */ + void qib_set_ctxtcnt(struct qib_devdata *dd) + { +- if (!qib_cfgctxts) { ++ u64 val = QIB_MODPARAM_GET(cfgctxts, dd->unit, 0); ++ if (!val) { + dd->cfgctxts = dd->first_user_ctxt + num_online_cpus(); + if (dd->cfgctxts > dd->ctxtcnt) + dd->cfgctxts = dd->ctxtcnt; +- } else if (qib_cfgctxts < dd->num_pports) ++ } else if (val < dd->num_pports) + dd->cfgctxts = dd->ctxtcnt; +- else if (qib_cfgctxts <= dd->ctxtcnt) +- dd->cfgctxts = qib_cfgctxts; ++ else if (val <= dd->ctxtcnt) ++ dd->cfgctxts = val; + else + dd->cfgctxts = dd->ctxtcnt; + dd->freectxts = (dd->first_user_ctxt > dd->cfgctxts) ? 0 : +@@ -142,13 +146,27 @@ void qib_set_ctxtcnt(struct qib_devdata *dd) + */ + int qib_create_ctxts(struct qib_devdata *dd) + { +- unsigned i; ++ unsigned i, c, p; ++ unsigned port; + int ret; ++ int node_id; + int local_node_id = pcibus_to_node(dd->pcidev->bus); ++ s64 new_node_id = qib_numa_node; + + if (local_node_id < 0) + local_node_id = numa_node_id(); +- dd->assigned_node_id = local_node_id; ++ ++ if (new_node_id < 0) ++ new_node_id = local_node_id; ++ ++ new_node_id = node_online(new_node_id) ? new_node_id : ++ local_node_id; ++ ++ dd->local_node_id = local_node_id; ++ dd->assigned_node_id = new_node_id; ++ ++ node_id = qib_numa_aware ? dd->local_node_id : ++ dd->assigned_node_id; + + /* + * Allocate full ctxtcnt array, rather than just cfgctxts, because +@@ -162,17 +180,29 @@ int qib_create_ctxts(struct qib_devdata *dd) + goto done; + } + ++ c = dd->num_pports ? min( ++ (unsigned)dd->pport[0].n_krcv_queues, ++ (dd->num_pports > 1 ? ++ (unsigned)dd->pport[1].n_krcv_queues : (unsigned)-1)) ++ : 0; ++ p = dd->num_pports > 1 ? ++ (dd->pport[0].n_krcv_queues > dd->pport[1].n_krcv_queues ? 
++ 0 : 1) : 0; ++ + /* create (one or more) kctxt */ +- for (i = 0; i < dd->first_user_ctxt; ++i) { ++ for (port = 0, i = 0; i < dd->first_user_ctxt; ++i) { + struct qib_pportdata *ppd; + struct qib_ctxtdata *rcd; + + if (dd->skip_kctxt_mask & (1 << i)) + continue; + +- ppd = dd->pport + (i % dd->num_pports); ++ if (i < (c * dd->num_pports)) ++ ppd = dd->pport + (i % dd->num_pports); ++ else ++ ppd = dd->pport + p; + +- rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id); ++ rcd = qib_create_ctxtdata(ppd, i, node_id); + if (!rcd) { + qib_dev_err(dd, + "Unable to allocate ctxtdata for Kernel ctxt, failing\n"); +@@ -722,10 +752,10 @@ int qib_init(struct qib_devdata *dd, int reinit) + if (lastfail) + ret = lastfail; + ppd = dd->pport + pidx; +- mtu = ib_mtu_enum_to_int(qib_ibmtu); ++ mtu = ib_mtu_enum_to_int( ++ QIB_MODPARAM_GET(ibmtu, dd->unit, ppd->port)); + if (mtu == -1) { + mtu = QIB_DEFAULT_MTU; +- qib_ibmtu = 0; /* don't leave invalid value */ + } + /* set max we can ever have for this driver load */ + ppd->init_ibmaxlen = min(mtu > 2048 ? +@@ -750,6 +780,11 @@ int qib_init(struct qib_devdata *dd, int reinit) + lastfail = -ENETDOWN; + continue; + } ++ if (snoop_enable) { ++ ppd->filter_callback = NULL; ++ ppd->filter_value = NULL; ++ ppd->mode_flag = 0; ++ } + + portok++; + } +@@ -1108,24 +1143,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) + unsigned long flags; + struct qib_devdata *dd; + int ret; +- int node_id; +- int local_node_id = pcibus_to_node(dd->pcidev->bus); +- s64 new_node_id = qib_numa_node; +- +- if (local_node_id < 0) +- local_node_id = numa_node_id(); +- +- if (new_node_id < 0) +- new_node_id = local_node_id; +- +- new_node_id = node_online(new_node_id) ? new_node_id : +- local_node_id; +- +- dd->local_node_id = local_node_id; +- dd->assigned_node_id = new_node_id; + +- node_id = qib_numa_aware ? dd->local_node_id : +- dd->assigned_node_id; + + dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra); + if (!dd) { +@@ -1273,6 +1291,15 @@ static int __init qlogic_ib_init(void) + if (ret) + goto bail; + ++ if (qib_numa_aware == QIB_DRIVER_AUTO_CONFIGURATION) ++ qib_numa_aware = qib_configure_numa(boot_cpu_data) ? 1 : 0; ++ ++ if (qib_rcvhdrpoll == QIB_DRIVER_AUTO_CONFIGURATION) ++ qib_rcvhdrpoll = qib_configure_numa(boot_cpu_data) ? 0 : 1; ++ ++ if (qib_pio_avail_bits == QIB_DRIVER_AUTO_CONFIGURATION) ++ qib_pio_avail_bits = qib_configure_numa(boot_cpu_data) ? 0 : 1; ++ + /* + * These must be called before the driver is registered with + * the PCI subsystem. 
+@@ -1298,13 +1325,13 @@ static int __init qlogic_ib_init(void) + #ifdef QIB_CONFIG_KNX + ret = qib_knx_server_init(); + if (ret < 0) +- pr_err("Unable to start KNX listen thread\n"); ++ printk(KERN_ERR QIB_DRV_NAME ++ ": Unable to start KNX listen thread\n"); + #endif +- + goto bail; /* all OK */ + + bail_dev: +-#ifdef CONFIG_INFINIBAND_QIB_DCA ++ #ifdef CONFIG_INFINIBAND_QIB_DCA + dca_unregister_notify(&dca_notifier); + #endif + #ifdef CONFIG_DEBUG_FS +@@ -1328,7 +1355,6 @@ static void __exit qlogic_ib_cleanup(void) + #ifdef QIB_CONFIG_KNX + qib_knx_server_exit(); + #endif +- + ret = qib_exit_qibfs(); + if (ret) + pr_err( +@@ -1348,6 +1374,7 @@ static void __exit qlogic_ib_cleanup(void) + + idr_destroy(&qib_unit_table); + qib_dev_cleanup(); ++ qib_clean_mod_param(); + } + + module_exit(qlogic_ib_cleanup); +@@ -1560,6 +1587,8 @@ static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) + } + + qib_verify_pioperf(dd); ++ if (snoop_enable) ++ qib_snoop_add(dd); + bail: + return ret; + } +@@ -1572,6 +1601,9 @@ static void qib_remove_one(struct pci_dev *pdev) + /* unregister from IB core */ + qib_unregister_ib_device(dd); + ++#ifdef QIB_CONFIG_KNX ++ qib_knx_remove_device(dd); ++#endif + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. +@@ -1686,7 +1718,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) + unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; + size_t size; + gfp_t gfp_flags; +- int old_node_id; ++ int old_dev_node; + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be +@@ -1706,14 +1738,14 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) + if (!rcd->rcvegrbuf) { + rcd->rcvegrbuf = + kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]), +- GFP_KERNEL, rcd->node_id); ++ GFP_KERNEL, rcd->node_id); + if (!rcd->rcvegrbuf) + goto bail; + } + if (!rcd->rcvegrbuf_phys) { + rcd->rcvegrbuf_phys = + kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), +- GFP_KERNEL, rcd->node_id); ++ GFP_KERNEL, rcd->node_id); + if (!rcd->rcvegrbuf_phys) + goto bail_rcvegrbuf; + } +@@ -1721,13 +1753,13 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) + if (rcd->rcvegrbuf[e]) + continue; + +- old_node_id = dev_to_node(&dd->pcidev->dev); ++ old_dev_node = dev_to_node(&dd->pcidev->dev); + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvegrbuf[e] = + dma_alloc_coherent(&dd->pcidev->dev, size, + &rcd->rcvegrbuf_phys[e], + gfp_flags); +- set_dev_node(&dd->pcidev->dev, old_node_id); ++ set_dev_node(&dd->pcidev->dev, old_dev_node); + if (!rcd->rcvegrbuf[e]) + goto bail_rcvegrbuf_phys; + } +diff --git a/drivers/infiniband/hw/qib/qib_knx.c b/drivers/infiniband/hw/qib/qib_knx.c +index c15276f..f692913 100644 +--- a/drivers/infiniband/hw/qib/qib_knx.c ++++ b/drivers/infiniband/hw/qib/qib_knx.c +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2012 Intel Corporation. All rights reserved. ++ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU +@@ -37,12 +37,21 @@ + + #include "qib.h" + #include "qib_knx.h" ++#include "qib_user_sdma.h" ++#include "qib_knx_common.h" + + unsigned int qib_knx_nconns = 5; + module_param_named(num_conns, qib_knx_nconns, uint, S_IRUGO); + MODULE_PARM_DESC(num_conns, "Max number of pending connections"); + + #define QIB_KNX_SCIF_PORT SCIF_OFED_PORT_9 ++#define CLIENT_THREAD_NAME(x) "qib/mic" __stringify(x) ++ ++#define knx_sdma_next(sdma) \ ++ sdma->head = ((sdma->head + 1) % sdma->desc_num) ++#define per_ctxt(ctxt, sub) ((ctxt * QLOGIC_IB_MAX_SUBCTXT) + sub) ++#define QIB_KNX_SDMA_STATUS(sdma, st) \ ++ QIB_KNX_SDMA_SET(sdma->mflags->status, ((u64)st << 32) | 1) + + struct qib_knx_server { + struct task_struct *kthread; +@@ -82,7 +91,16 @@ struct qib_knx_mem_map_sg { + struct scif_range *pages; + }; + ++struct qib_knx_tidrcv { ++ struct qib_knx_rma tidmem; ++ u64 tidbase; ++ u32 tidcnt; ++}; ++ + struct qib_knx_ctxt { ++ u16 ctxt; ++ struct qib_knx *knx; ++ struct qib_pportdata *ppd; + /* local registered memory for PIO buffers */ + struct qib_knx_rma piobufs[QLOGIC_IB_MAX_SUBCTXT]; + /* local registered memory for user registers */ +@@ -104,6 +122,23 @@ struct qib_knx_ctxt { + __u64 status; + __u64 piobufbase[QLOGIC_IB_MAX_SUBCTXT]; + __u32 runtime_flags; ++ ++ struct qib_user_sdma_queue *pq[QLOGIC_IB_MAX_SUBCTXT]; ++}; ++ ++struct qib_knx_sdma { ++ /* KNX flags page */ ++ struct scif_range *mflag_pages; ++ struct qib_knx_sdma_mflags *mflags; ++ /* KNX descriptor queue */ ++ struct scif_range *queue_pages; ++ struct qib_knx_sdma_desc *queue; ++ u32 desc_num; ++ /* host flags (in host memory) */ ++ struct qib_knx_rma hflags_mem; ++ struct qib_knx_sdma_hflags *hflags; ++ u32 head; /* shadow */ ++ u32 complete; + }; + + struct qib_knx { +@@ -114,10 +149,16 @@ struct qib_knx { + int numa_node; + struct qib_devdata *dd; + struct qib_knx_ctxt **ctxts; ++ spinlock_t ctxt_lock; ++ resource_size_t bar; ++ u64 barlen; ++ struct qib_knx_sdma *sdma; ++ struct task_struct *sdma_poll; ++ atomic_t tref; ++ char tname[64]; ++ struct qib_knx_rma tidmem; + }; + +-#define CLIENT_THREAD_NAME(x) "qib/mic" __stringify(x) +- + static struct qib_knx_server *server; + + static int qib_knx_init(struct qib_knx_server *); +@@ -127,19 +168,20 @@ static off_t qib_knx_register_memory(struct qib_knx *, struct qib_knx_rma *, + void *, size_t, int, const char *); + static int qib_knx_unregister_memory(struct qib_knx *, struct qib_knx_rma *, + const char *); ++static __always_inline void qib_knx_memcpy(void *, void __iomem *, size_t); + static ssize_t qib_show_knx_node(struct device *, struct device_attribute *, + char *); +- +-static DEVICE_ATTR(knx_node, S_IRUGO, qib_show_knx_node, NULL); +-static ssize_t qib_show_knx_node(struct device *dev, +- struct device_attribute *attr, char *buf) +-{ +- struct qib_ibdev *ibdev = +- container_of(dev, struct qib_ibdev, ibdev.dev); +- struct qib_devdata *dd = dd_from_dev(ibdev); +- +- return scnprintf(buf, PAGE_SIZE, "%u\n", dd->knx->peer.node); +-} ++static int qib_knx_sdma_init(struct qib_knx *); ++static void qib_knx_sdma_teardown(struct qib_knx *); ++static __always_inline struct page * ++qib_knx_phys_to_page(struct qib_knx *, unsigned long); ++static int qib_knx_sdma_pkts_to_descs(struct qib_knx_ctxt *, ++ struct qib_knx_sdma_desc *, ++ struct qib_user_sdma_queue *, ++ int *, struct list_head *); ++static int qib_knx_sdma_poll(void *); ++static int qib_knx_tidrcv_init(struct qib_knx *); ++static int qib_knx_tidrcv_teardown(struct 
qib_knx *); + + inline struct qib_knx *qib_knx_get(u16 nodeid) + { +@@ -162,10 +204,11 @@ inline struct qib_devdata *qib_knx_node_to_dd(u16 node) + + static int qib_knx_init(struct qib_knx_server *server) + { +- int ret = 0, num_devs = 0, i; +- struct qib_devdata *dd; ++ int ret = 0, num_devs = 0, i, seen = 0; ++ unsigned fewest = -1U; ++ struct qib_devdata *dd = NULL, *dd_no_numa = NULL; + struct qib_knx *knx; +- struct ib_device *ibdev; ++ struct qib_device_info info = { -1 }; + + knx = kzalloc(sizeof(*knx), GFP_KERNEL); + if (!knx) { +@@ -179,10 +222,14 @@ static int qib_knx_init(struct qib_knx_server *server) + } + + INIT_LIST_HEAD(&knx->list); ++ spin_lock_init(&knx->ctxt_lock); + knx->numa_node = -1; + ret = scif_pci_info(knx->peer.node, &knx->pci_info); +- if (!ret) ++ if (!ret) { + knx->numa_node = pcibus_to_node(knx->pci_info.pdev->bus); ++ knx->bar = pci_resource_start(knx->pci_info.pdev, 0); ++ knx->barlen = pci_resource_len(knx->pci_info.pdev, 0); ++ } + + if (knx->numa_node < 0) + knx->numa_node = numa_node_id(); +@@ -190,40 +237,58 @@ static int qib_knx_init(struct qib_knx_server *server) + num_devs = qib_count_units(NULL, NULL); + if (unlikely(!num_devs)) { + ret = -ENODEV; ++ /* we have to send this */ ++ scif_send(knx->epd.epd, &info, sizeof(info), ++ SCIF_SEND_BLOCK); + goto done; + } + +- for (i = 0; i < num_devs; i++) { ++ /* ++ * Attempt to find an HCA on the same NUMA node as the card. Save ++ * the first HCA that hasn't been associated with a card in case ++ * there is no HCA on the same NUMA node. ++ */ ++ for (i = 0; seen < num_devs; i++) { + dd = qib_lookup(i); +- if (dd && dd->local_node_id == knx->numa_node) +- knx->dd = dd; ++ if (dd) { ++ if (dd->local_node_id == knx->numa_node) { ++ knx->dd = dd; ++ break; ++ } else if (dd->num_knx < fewest) ++ dd_no_numa = dd; ++ seen++; ++ } + } + /* + * We didn't find a QIB device on the same NUMA node, +- * round-robin across all devices. ++ * use the "backup". + */ + if (unlikely(!knx->dd)) { +- knx->dd = qib_lookup(server->nclients % num_devs); +- /* it is possible for qib_lookup to return NULL */ +- if (unlikely(!knx->dd)) { ++ if (!dd_no_numa) { + ret = -ENODEV; ++ /* we have to send this */ ++ scif_send(knx->epd.epd, &info, sizeof(info), ++ SCIF_SEND_BLOCK); + goto done; + } ++ knx->dd = dd_no_numa; + } +- knx->dd->node_id = knx->peer.node; +- knx->dd->knx = knx; ++ knx->dd->num_knx++; ++ + knx->ctxts = kzalloc_node(knx->dd->ctxtcnt * sizeof(*knx->ctxts), + GFP_KERNEL, knx->numa_node); + if (!knx->ctxts) + ret = -ENOMEM; +- ibdev = &knx->dd->verbs_dev.ibdev; +- ret = device_create_file(&ibdev->dev, &dev_attr_knx_node); ++ /* Give the KNX the associated device information. */ ++ info.unit = knx->dd->unit; ++ ret = scif_send(knx->epd.epd, &info, sizeof(info), ++ SCIF_SEND_BLOCK); ++ ++ ret = qib_knx_sdma_init(knx); + if (ret) +- /* +- * clear the error code since we don't want to fail the +- * initialization. 
+- */ +- ret = 0; ++ goto done; ++ atomic_set(&knx->tref, 0); ++ ret = qib_knx_tidrcv_init(knx); + done: + spin_lock(&server->client_lock); + list_add_tail(&knx->list, &server->clients); +@@ -237,13 +302,12 @@ bail: + static void qib_knx_free(struct qib_knx *knx, int unload) + { + struct qib_devdata *dd = knx->dd; +- struct ib_device *ibdev; + int i; + +- if (dd) { +- ibdev = &dd->verbs_dev.ibdev; +- device_remove_file(&ibdev->dev, &dev_attr_knx_node); +- } ++ qib_knx_tidrcv_teardown(knx); ++ qib_knx_sdma_teardown(knx); ++ if (dd) ++ dd->num_knx--; + /* + * If this function is called with unload set, we can + * free the context data. Otherwise, we are here +@@ -360,9 +424,16 @@ done: + return ret; + } + +-int qib_knx_alloc_ctxt(struct qib_devdata *dd, unsigned ctxt) ++static __always_inline void qib_knx_memcpy(void *dst, void __iomem *src, ++ size_t size) + { +- struct qib_knx *knx = dd_to_knx(dd); ++ memcpy_fromio(dst, src, size); ++} ++ ++int qib_knx_alloc_ctxt(u16 node_id, unsigned ctxt) ++{ ++ struct qib_knx *knx = qib_knx_get(node_id); ++ struct qib_devdata *dd = knx->dd; + struct qib_knx_ctxt *ptr; + int ret = 0; + +@@ -379,7 +450,14 @@ int qib_knx_alloc_ctxt(struct qib_devdata *dd, unsigned ctxt) + ret = -ENOMEM; + goto bail; + } ++ ptr->knx = knx; ++ ptr->ctxt = ctxt; ++ ptr->ppd = dd->rcd[ctxt]->ppd; ++ ++ spin_lock(&knx->ctxt_lock); + knx->ctxts[ctxt] = ptr; ++ dd->rcd[ctxt]->krcd = ptr; ++ spin_unlock(&knx->ctxt_lock); + bail: + return ret; + } +@@ -388,10 +466,11 @@ __u64 qib_knx_ctxt_info(struct qib_ctxtdata *rcd, + enum qib_knx_ctxtinfo_type type, + struct file *fp) + { +- struct qib_knx *knx = dd_to_knx(rcd->dd); ++ struct qib_knx *knx = rcd->krcd->knx; + __u16 subctxt; + __u64 ret = 0; + ++ spin_lock(&knx->ctxt_lock); + if (!knx || !knx->ctxts || !knx->ctxts[rcd->ctxt]) + goto done; + +@@ -414,6 +493,7 @@ __u64 qib_knx_ctxt_info(struct qib_ctxtdata *rcd, + break; + } + done: ++ spin_unlock(&knx->ctxt_lock); + return ret; + } + +@@ -424,7 +504,7 @@ int qib_knx_setup_piobufs(struct qib_devdata *dd, struct qib_ctxtdata *rcd, + char buf[16]; + off_t offset; + int ret = 0; +- struct qib_knx *knx = dd_to_knx(dd); ++ struct qib_knx *knx = rcd->krcd->knx; + + if (unlikely(!knx)) { + ret = -ENODEV; +@@ -472,7 +552,7 @@ int qib_knx_setup_pioregs(struct qib_devdata *dd, struct qib_ctxtdata *rcd, + { + int ret = 0; + off_t offset; +- struct qib_knx *knx = dd_to_knx(dd); ++ struct qib_knx *knx = rcd->krcd->knx; + + if (unlikely(!knx)) { + ret = -ENODEV; +@@ -496,7 +576,7 @@ int qib_knx_setup_pioregs(struct qib_devdata *dd, struct qib_ctxtdata *rcd, + goto bail; + } + knx->ctxts[rcd->ctxt]->uregbase = offset; +- ++ + /* + * register the PIO availability registers. + * user status 64bit values are part of the page containing the +@@ -533,7 +613,7 @@ int qib_knx_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd, + { + struct qib_knx_mem_map_sg *mapsg; + struct qib_knx_mem_map *map; +- struct qib_knx *knx = dd_to_knx(dd); ++ struct qib_knx *knx = rcd->krcd->knx; + dma_addr_t offset; + struct scatterlist *sg; + unsigned num_pages; +@@ -590,7 +670,8 @@ int qib_knx_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd, + * can use 64bit addresses for DMA but the CPU might not. + * (see pci_set_dma_mask() in qib_pcie.c). 
+ */ +- mapsg->sglist = kzalloc(num_pages * sizeof(*mapsg->sglist), GFP_KERNEL); ++ mapsg->sglist = kzalloc_node(num_pages * sizeof(*mapsg->sglist), ++ GFP_KERNEL, knx->numa_node); + if (!mapsg->sglist) { + ret = -ENOMEM; + goto bail_rcvq_pages; +@@ -625,7 +706,7 @@ int qib_knx_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd, + } + rcd->rcvhdrq_phys = sg_dma_address(mapsg->sglist); + rcd->rcvhdrq = mapsg->pages->va[0]; +- ++ + map = &knx->ctxts[rcd->ctxt]->sbufstatus; + ret = scif_get_pages(knx->epd.epd, binfo->spi_sendbuf_status, + PAGE_SIZE, &map->pages); +@@ -700,7 +781,7 @@ int qib_knx_setup_eagerbufs(struct qib_ctxtdata *rcd, + struct qib_knx_mem_map_sg *map; + struct scatterlist *sg; + struct qib_devdata *dd = rcd->dd; +- struct qib_knx *knx = dd_to_knx(dd); ++ struct qib_knx *knx = rcd->krcd->knx; + unsigned size, egrsize, egrcnt, num_pages, bufs_ppage, + egrbufcnt; + dma_addr_t dma_addr, page; +@@ -761,7 +842,7 @@ int qib_knx_setup_eagerbufs(struct qib_ctxtdata *rcd, + goto bail_free_scif; + } + } +- ++ + /* + * Allocate array of DMA addresses for each of the mapped + * pages. +@@ -775,10 +856,11 @@ int qib_knx_setup_eagerbufs(struct qib_ctxtdata *rcd, + goto bail_free_rcvegr; + } + } +- ++ + map->size = size; + map->dir = DMA_BIDIRECTIONAL; +- map->sglist = kzalloc(num_pages * sizeof(*map->sglist), GFP_KERNEL); ++ map->sglist = kzalloc_node(num_pages * sizeof(*map->sglist), GFP_KERNEL, ++ knx->numa_node); + if (!map->sglist) { + ret = -ENOMEM; + goto bail_free_rcvegr_phys; +@@ -830,7 +912,7 @@ bail: + + void qib_knx_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) + { +- struct qib_knx *knx = dd_to_knx(dd); ++ struct qib_knx *knx = rcd->krcd->knx; + struct qib_knx_ctxt *ctxt; + char buf[16]; + int i, ret = 0; +@@ -838,7 +920,11 @@ void qib_knx_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) + if (!rcd || !knx || !knx->ctxts) + return; + ++ spin_lock(&knx->ctxt_lock); + ctxt = knx->ctxts[rcd->ctxt]; ++ knx->ctxts[rcd->ctxt] = NULL; ++ spin_unlock(&knx->ctxt_lock); ++ + if (!ctxt) + return; + +@@ -884,12 +970,535 @@ void qib_knx_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) + qib_knx_unregister_memory(knx, &ctxt->piobufs[i], buf); + } + +- /* MITKO XXX: handle rcd->tid_pg_list */ +- knx->ctxts[rcd->ctxt] = NULL; + kfree(ctxt); + kfree(rcd); + } + ++/* ++ * TID management for processes on the MIC happens on the MIC. Therefore, ++ * we only register the HW TID array here. ++ * The MIC will calculate TID array offsets using the same algorithm is ++ * the host. Therefore, it is OK that the entire HW TID array is mapped ++ * since neither side should step on the other. 
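++ * (Host and MIC contexts compute their offsets the same way, so each side
++ * only ever updates its own slice of the array.)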
++ */ ++static int qib_knx_tidrcv_init(struct qib_knx *knx) ++{ ++ struct qib_devdata *dd = knx->dd; ++ struct qib_knx_tid_info info; ++ void *tidbase; ++ int ret = 0; ++ off_t offset = 0; ++ size_t len; ++ char buf[64]; ++ ++ memset(&info, 0, sizeof(info)); ++ ++ info.tidcnt = dd->rcvtidcnt; ++ tidbase = ((char *)dd->kregbase + dd->rcvtidbase); ++ info.tidbase_len = dd->ctxtcnt * dd->rcvtidcnt * sizeof(tidbase); ++ info.tidtemplate = dd->tidtemplate; ++ info.invalidtid = dd->tidinvalid; ++ /* information needed to properly calculate DMA address to MIC pages */ ++ info.bar_addr = knx->bar; ++ info.bar_len = knx->barlen; ++ ++ snprintf(buf, sizeof(buf), "TID array KNx%u", knx->peer.node); ++ offset = qib_knx_register_memory(knx, &knx->tidmem, tidbase, ++ info.tidbase_len, SCIF_PROT_WRITE, ++ buf); ++ info.tidbase_offset = offset; ++ if (IS_ERR_VALUE(offset)) ++ ret = offset; ++ len = scif_send(knx->epd.epd, &info, sizeof(info), ++ SCIF_SEND_BLOCK); ++ if (len < sizeof(info)) ++ ret = -EFAULT; ++ return ret; ++} ++ ++static int qib_knx_tidrcv_teardown(struct qib_knx *knx) ++{ ++ char buf[64]; ++ snprintf(buf, sizeof(buf), "TID array KNx%u", knx->peer.node); ++ return qib_knx_unregister_memory(knx, &knx->tidmem, buf); ++} ++ ++static int qib_knx_sdma_init(struct qib_knx *knx) ++{ ++ struct qib_knx_host_mem flags; ++ struct qib_knx_knc_mem mflags; ++ struct qib_knx_sdma *sdma; ++ char buf[64]; ++ int ret = 0; ++ ++ sdma = kzalloc_node(sizeof(*sdma), GFP_KERNEL, knx->numa_node); ++ if (!sdma) { ++ ret = -ENOMEM; ++ goto done; ++ } ++ sdma->hflags = kzalloc_node(PAGE_SIZE, GFP_KERNEL, knx->numa_node); ++ if (!sdma->hflags) { ++ ret = -ENOMEM; ++ goto done_free; ++ } ++ snprintf(buf, sizeof(buf), "Host SDMA flags KNx%u", knx->peer.node); ++ flags.flags_offset = qib_knx_register_memory(knx, &sdma->hflags_mem, ++ sdma->hflags, ++ PAGE_SIZE, ++ SCIF_PROT_WRITE, ++ buf); ++ if (IS_ERR_VALUE(flags.flags_offset)) { ++ ret = flags.flags_offset; ++ goto free_flags; ++ } ++ sdma->desc_num = knx->dd->pport[0].sdma_descq_cnt; ++ flags.desc_num = sdma->desc_num; ++ ret = scif_send(knx->epd.epd, &flags, sizeof(flags), ++ SCIF_SEND_BLOCK); ++ if (ret < sizeof(flags)) ++ goto unregister; ++ ret = scif_recv(knx->epd.epd, &mflags, sizeof(mflags), ++ SCIF_RECV_BLOCK); ++ if (ret < sizeof(mflags)) { ++ ret = -EINVAL; ++ goto unregister; ++ } ++ ret = scif_get_pages(knx->epd.epd, mflags.flags_offset, ++ PAGE_SIZE, &sdma->mflag_pages); ++ if (ret < 0 || !sdma->mflag_pages->nr_pages) { ++ ret = -EFAULT; ++ goto unregister; ++ } ++ sdma->mflags = sdma->mflag_pages->va[0]; ++ ret = scif_get_pages(knx->epd.epd, mflags.queue_offset, ++ mflags.queue_len, &sdma->queue_pages); ++ if (ret < 0) ++ goto put_flags; ++ if ((sdma->queue_pages->nr_pages * PAGE_SIZE) != ++ mflags.queue_len) { ++ ret = -EFAULT; ++ goto put_queue; ++ } ++ sdma->queue = sdma->queue_pages->va[0]; ++ sdma->complete = -1; ++ sdma->head = -1; ++ /* set the initial trigger value */ ++ QIB_KNX_SDMA_SET(sdma->hflags->trigger, -1); ++ QIB_KNX_SDMA_SET(sdma->mflags->complete, sdma->complete); ++ snprintf(knx->tname, sizeof(knx->tname), "qib/mic%u/poll", ++ knx->peer.node); ++ knx->sdma = sdma; ++ ret = 0; ++ goto done; ++put_queue: ++ scif_put_pages(sdma->queue_pages); ++put_flags: ++ scif_put_pages(sdma->mflag_pages); ++unregister: ++ qib_knx_unregister_memory(knx, &sdma->hflags_mem, buf); ++free_flags: ++ kfree(sdma->hflags); ++done_free: ++ kfree(sdma); ++done: ++ /* ++ * we have to respond to the MIC so it doesn't get stuck ++ * in the scif_recv call ++ */ 
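++	/* ret is 0 on success or the error code from one of the failure paths above. */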
++ scif_send(knx->epd.epd, &ret, sizeof(ret), SCIF_SEND_BLOCK); ++ return ret; ++} ++ ++static void qib_knx_sdma_teardown(struct qib_knx *knx) ++{ ++ int ret; ++ if (knx->sdma_poll) ++ ret = kthread_stop(knx->sdma_poll); ++ if (knx->sdma) { ++ if (knx->sdma->queue_pages->nr_pages) { ++ knx->sdma->queue = NULL; ++ scif_put_pages(knx->sdma->queue_pages); ++ } ++ if (knx->sdma->mflag_pages->nr_pages) { ++ knx->sdma->mflags = NULL; ++ scif_put_pages(knx->sdma->mflag_pages); ++ } ++ kfree(knx->sdma->hflags); ++ kfree(knx->sdma); ++ knx->sdma = NULL; ++ } ++} ++ ++int qib_knx_sdma_queue_create(struct file *fd) ++{ ++ struct qib_ctxtdata *rcd = ctxt_fp(fd); ++ struct qib_devdata *dd = rcd->dd; ++ struct qib_knx *knx = rcd->krcd->knx; ++ struct qib_knx_ctxt *ctxt = knx->ctxts[rcd->ctxt]; ++ u8 subctxt = subctxt_fp(fd); ++ int ret = 0; ++ ++ if (!ctxt) { ++ ret = -EINVAL; ++ goto done; ++ } ++ ctxt->pq[subctxt] = qib_user_sdma_queue_create(&dd->pcidev->dev, ++ dd->unit, rcd->ctxt, ++ subctxt); ++ if (!ctxt->pq[subctxt]) ++ ret = -ENOMEM; ++ user_sdma_queue_fp(fd) = ctxt->pq[subctxt]; ++ /* ++ * We start the polling thread the first time a user SDMA ++ * queue is created. There is no reason to take up CPU ++ * cycles before then. ++ */ ++ if (atomic_inc_return(&knx->tref) == 1) { ++ knx->sdma_poll = kthread_run(qib_knx_sdma_poll, knx, ++ knx->tname); ++ if (IS_ERR(knx->sdma_poll)) { ++ ret = -PTR_ERR(knx->sdma_poll); ++ atomic_dec(&knx->tref); ++ goto free_queue; ++ } ++ } ++ goto done; ++free_queue: ++ user_sdma_queue_fp(fd) = NULL; ++ qib_user_sdma_queue_destroy(ctxt->pq[subctxt]); ++ ctxt->pq[subctxt] = NULL; ++done: ++ return ret; ++} ++ ++void qib_knx_sdma_queue_destroy(struct qib_filedata *fd) ++{ ++ struct qib_ctxtdata *rcd = fd->rcd; ++ struct qib_knx *knx; ++ unsigned ctxt = rcd->ctxt, subctxt = fd->subctxt; ++ ++ /* Host processes do not have a KNX rcd pointer. */ ++ if (!rcd->krcd) ++ return; ++ knx = rcd->krcd->knx; ++ /* We still have the memory pointer through fd->pq */ ++ spin_lock(&knx->ctxt_lock); ++ if (knx->ctxts[ctxt]) ++ knx->ctxts[ctxt]->pq[subctxt] = NULL; ++ spin_unlock(&knx->ctxt_lock); ++ if (atomic_dec_and_test(&knx->tref)) { ++ int ret = kthread_stop(knx->sdma_poll); ++ knx->sdma_poll = NULL; ++ } ++} ++ ++/* ++ * Convert a MIC physical address to the corresponding host page. ++ */ ++static __always_inline struct page * ++qib_knx_phys_to_page(struct qib_knx *knx, unsigned long addr) { ++ unsigned long paddr; ++ if ((knx->bar + addr + PAGE_SIZE) > ++ (knx->bar + knx->barlen)) ++ return NULL; ++ paddr = knx->bar + addr; ++ return pfn_to_page(paddr >> PAGE_SHIFT); ++} ++ ++static int qib_knx_sdma_pkts_to_descs(struct qib_knx_ctxt *ctxt, ++ struct qib_knx_sdma_desc *desc, ++ struct qib_user_sdma_queue *pq, ++ int *ndesc, struct list_head *list) ++{ ++ struct qib_knx *knx = ctxt->knx; ++ struct qib_user_sdma_pkt *pkt; ++ dma_addr_t pbc_dma_addr; ++ unsigned pktnw, pbcnw; ++ u32 counter; ++ u16 frag_size; ++ int ret = 0; ++ __le32 *pbc; ++ ++ counter = pq->counter; ++ ++ pbc = qib_user_sdma_alloc_header(pq, desc->pbclen, &pbc_dma_addr); ++ if (!pbc) { ++ ret = -ENOMEM; ++ goto done; ++ } ++ memcpy(pbc, desc->pbc, desc->pbclen); ++ ++ pktnw = (le32_to_cpu(*pbc) & 0xFFFF); ++ /* ++ * This assignment is a bit strange. it's because the ++ * the pbc counts the number of 32 bit words in the full ++ * packet _except_ the first word of the pbc itself... 
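++	 * For a well-formed request this means
++	 *   pktnw == (desc->length >> 2) + (desc->pbclen >> 2) - 1,
++	 * which is exactly what the checks below enforce.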
++ */ ++ pbcnw = (desc->pbclen >> 2) - 1; ++ ++ if (pktnw < pbcnw) { ++ ret = -EINVAL; ++ goto free_pbc; ++ } ++ ++ if (pktnw != ((desc->length >> 2) + pbcnw)) { ++ ret = -EINVAL; ++ goto free_pbc; ++ } ++ ++ frag_size = (le32_to_cpu(*pbc)>>16) & 0xFFFF; ++ if (((frag_size ? frag_size : desc->length) + desc->pbclen) > ++ ctxt->ppd->ibmaxlen) { ++ ret = -EINVAL; ++ goto free_pbc; ++ } ++ if (frag_size) { ++ /* new SDMA "protocol" */ ++ unsigned pktsize, n; ++ ++ n = desc->npages * ((2 * PAGE_SIZE / frag_size) + 1); ++ pktsize = sizeof(*pkt) + sizeof(pkt->addr[0]) * n; ++ ++ pkt = kzalloc(pktsize + desc->tidlen, GFP_KERNEL); ++ if (!pkt) { ++ ret = -ENOMEM; ++ goto free_pbc; ++ } ++ pkt->largepkt = 1; ++ pkt->frag_size = frag_size; ++ pkt->addrlimit = n + ARRAY_SIZE(pkt->addr); ++ ++ if (desc->tidlen) { ++ char *tidsmptr = (char *)pkt + pktsize; ++ memcpy(tidsmptr, desc->tidsm, desc->tidlen); ++ pkt->tidsm = ++ (struct qib_tid_session_member *)tidsmptr; ++ pkt->tidsmcount = desc->tidlen / ++ sizeof(*desc->tidsm); ++ pkt->tidsmidx = 0; ++ } ++ *pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF); ++ } else { ++ /* old SDMA */ ++ pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); ++ if (!pkt) { ++ ret = -ENOMEM; ++ goto free_pbc; ++ } ++ pkt->largepkt = 0; ++ pkt->frag_size = desc->length; ++ pkt->addrlimit = ARRAY_SIZE(pkt->addr); ++ } ++ pkt->bytes_togo = desc->length; ++ pkt->payload_size = 0; ++ pkt->counter = counter; ++ pkt->tiddma = !!desc->tidlen; ++ /* ++ * The generic user SDMA code will use this as a flag to ++ * decide whether to call the KNx-specific pkt free ++ * function. However, it doesn't know what the value ++ * actually means. ++ */ ++ pkt->remote = (u64)knx; ++ ++ qib_user_sdma_init_frag(pkt, 0, ++ 0, desc->pbclen, ++ 1, 0, ++ 0, 0, ++ NULL, pbc, ++ pbc_dma_addr, desc->pbclen); ++ pkt->index = 0; ++ pkt->naddr = 1; ++ ++ if (desc->npages) { ++ /* we have user data */ ++ int i; ++ struct page *page; ++ unsigned plen = 0, len = desc->length; ++ for (i = 0; i < desc->npages; i++) { ++ unsigned long off = (i == 0 ? desc->offset : 0); ++ plen = (len > PAGE_SIZE ? PAGE_SIZE : len); ++ page = qib_knx_phys_to_page(knx, desc->pages[i]); ++ ret = qib_user_sdma_page_to_frags(knx->dd, pq, ++ pkt, page, 0, off, ++ (off + plen > PAGE_SIZE ? 
++ PAGE_SIZE - off : plen), ++ NULL); ++ if (ret < 0) ++ goto free_sdma; ++ len -= plen - off; ++ } ++ } else { ++ pkt->addr[0].last_desc = 1; ++ if (pbc_dma_addr == 0) { ++ pbc_dma_addr = dma_map_single(&knx->dd->pcidev->dev, ++ pbc, desc->pbclen, ++ DMA_TO_DEVICE); ++ if (dma_mapping_error(&knx->dd->pcidev->dev, ++ pbc_dma_addr)) { ++ ret = -ENOMEM; ++ goto free_sdma; ++ } ++ pkt->addr[0].addr = pbc_dma_addr; ++ pkt->addr[0].dma_mapped = 1; ++ } ++ } ++ counter++; ++ pkt->pq = pq; ++ pkt->index = 0; ++ *ndesc = pkt->naddr; ++ ++ list_add_tail(&pkt->list, list); ++ goto done; ++free_sdma: ++ if (pkt->largepkt) ++ kfree(pkt); ++ else ++ kmem_cache_free(pq->pkt_slab, pkt); ++free_pbc: ++ if (pbc_dma_addr) ++ dma_pool_free(pq->header_cache, pbc, pbc_dma_addr); ++ else ++ kfree(pbc); ++done: ++ return ret; ++} ++ ++void qib_knx_sdma_free_pkt(struct qib_user_sdma_pkt *pkt) ++{ ++ struct qib_knx *knx = (struct qib_knx *)pkt->remote; ++ struct qib_knx_sdma *sdma = knx->sdma; ++ sdma_next(sdma, complete); ++ QIB_KNX_SDMA_SET(sdma->mflags->complete, sdma->complete); ++} ++ ++static int qib_knx_sdma_poll(void *data) ++{ ++ struct qib_knx *knx = (struct qib_knx *)data; ++ struct qib_knx_ctxt *ctxt; ++ struct qib_knx_sdma_desc desc; ++ struct qib_knx_sdma *sdma = knx->sdma; ++ struct qib_user_sdma_queue *pq; ++ struct list_head list; ++ u32 new_head; ++ int ret = 0, ndesc = 0, added; ++ ++ if (!sdma) ++ return -EFAULT; ++ ++ while (!kthread_should_stop()) { ++ added = 0; ++ new_head = QIB_KNX_SDMA_VALUE(sdma->hflags->trigger); ++ while (sdma->head != new_head) { ++ knx_sdma_next(sdma); ++ qib_knx_memcpy(&desc, sdma->queue + sdma->head, ++ sizeof(desc)); ++ if (!desc.ctxt) { ++ QIB_KNX_SDMA_STATUS(sdma, -EINVAL); ++ continue; ++ } ++ spin_lock(&knx->ctxt_lock); ++ ctxt = knx->ctxts[desc.ctxt]; ++ if (!ctxt) { ++ /* we should never get here */ ++ QIB_KNX_SDMA_STATUS(sdma, -EINVAL); ++ goto done_unlock; ++ } ++ pq = ctxt->pq[desc.subctxt]; ++ if (!pq) { ++ QIB_KNX_SDMA_STATUS(sdma, -EFAULT); ++ goto done_unlock; ++ } ++ mutex_lock(&pq->lock); ++ if (pq->added > ctxt->ppd->sdma_descq_removed) ++ qib_user_sdma_hwqueue_clean(ctxt->ppd); ++ if (pq->num_sending) ++ qib_user_sdma_queue_clean(ctxt->ppd, pq); ++ ++ INIT_LIST_HEAD(&list); ++ ret = qib_knx_sdma_pkts_to_descs(ctxt, &desc, pq, ++ &ndesc, &list); ++ QIB_KNX_SDMA_STATUS(sdma, ret); ++ if (!list_empty(&list)) { ++ if (qib_sdma_descq_freecnt(ctxt->ppd) < ++ ndesc) { ++ qib_user_sdma_hwqueue_clean( ++ ctxt->ppd); ++ if (pq->num_sending) ++ qib_user_sdma_queue_clean( ++ ctxt->ppd, pq); ++ } ++ ret = qib_user_sdma_push_pkts(ctxt->ppd, ++ pq, &list, 1); ++ if (ret < 0) ++ goto free_pkts; ++ else { ++ pq->counter++; ++ added++; ++ } ++ } ++free_pkts: ++ if (!list_empty(&list)) ++ qib_user_sdma_free_pkt_list( ++ &knx->dd->pcidev->dev, pq, &list); ++ mutex_unlock(&pq->lock); ++done_unlock: ++ spin_unlock(&knx->ctxt_lock); ++ } ++ if (!added) { ++ int i; ++ /* ++ * Push the queues along ++ * The polling thread will enter the inner loop only ++ * if the KNX has posted new descriptors to the queue. ++ * However, any packets that have been completed by ++ * the HW need to be cleaned and that won't happen ++ * unless we explicitly check. 
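++		 * So walk every (context, subcontext) queue below and reap
++		 * anything the hardware has already finished with.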
++ */ ++ for (i = 0; ++ i < knx->dd->ctxtcnt * QLOGIC_IB_MAX_SUBCTXT; ++ i++) { ++ int c = i / QLOGIC_IB_MAX_SUBCTXT, ++ s = i % QLOGIC_IB_MAX_SUBCTXT; ++ spin_lock(&knx->ctxt_lock); ++ ctxt = knx->ctxts[c]; ++ if (!ctxt) ++ goto loop_unlock; ++ pq = ctxt->pq[s]; ++ if (!pq) ++ goto loop_unlock; ++ mutex_lock(&pq->lock); ++ if (pq->num_sending) ++ qib_user_sdma_queue_clean(ctxt->ppd, ++ pq); ++ mutex_unlock(&pq->lock); ++loop_unlock: ++ spin_unlock(&knx->ctxt_lock); ++ } ++ might_sleep(); ++ } ++ } ++ return ret; ++} ++ ++void qib_knx_remove_device(struct qib_devdata *dd) ++{ ++ if (server && dd->num_knx) { ++ struct qib_knx *knx, *knxp; ++ list_for_each_entry_safe(knx, knxp, &server->clients, list) { ++ if (knx->dd == dd) { ++ spin_lock(&server->client_lock); ++ list_del(&knx->list); ++ server->nclients--; ++ spin_unlock(&server->client_lock); ++ qib_knx_free(knx, 0); ++ kfree(knx); ++ } ++ } ++ } ++ return; ++} ++ + int __init qib_knx_server_init(void) + { + server = kzalloc(sizeof(struct qib_knx_server), GFP_KERNEL); +@@ -908,7 +1517,6 @@ void __exit qib_knx_server_exit(void) + { + if (server) { + struct qib_knx *t, *tt; +- + /* Stop the thread so we don't accept any new connections. */ + kthread_stop(server->kthread); + list_for_each_entry_safe(t, tt, &server->clients, list) { +@@ -921,3 +1529,4 @@ void __exit qib_knx_server_exit(void) + kfree(server); + } + } ++ +diff --git a/drivers/infiniband/hw/qib/qib_knx.h b/drivers/infiniband/hw/qib/qib_knx.h +index d767a60..fcb5a3e 100644 +--- a/drivers/infiniband/hw/qib/qib_knx.h ++++ b/drivers/infiniband/hw/qib/qib_knx.h +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2012 Intel Corporation. All rights reserved. ++ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU +@@ -44,13 +44,12 @@ enum qib_knx_ctxtinfo_type { + + int __init qib_knx_server_init(void); + void __exit qib_knx_server_exit(void); +-static __always_inline struct qib_knx *dd_to_knx(struct qib_devdata *dd) +-{ +- return (struct qib_knx *)dd->knx; +-} ++ ++void qib_knx_remove_device(struct qib_devdata *); ++ + inline struct qib_knx *qib_knx_get(uint16_t); + inline struct qib_devdata *qib_knx_node_to_dd(uint16_t); +-int qib_knx_alloc_ctxt(struct qib_devdata *, unsigned); ++int qib_knx_alloc_ctxt(u16, unsigned); + int qib_knx_setup_piobufs(struct qib_devdata *, struct qib_ctxtdata *, __u16); + int qib_knx_setup_pioregs(struct qib_devdata *, struct qib_ctxtdata *, + struct qib_base_info *); +@@ -60,4 +59,6 @@ int qib_knx_setup_eagerbufs(struct qib_ctxtdata *, struct qib_base_info *); + void qib_knx_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *); + __u64 qib_knx_ctxt_info(struct qib_ctxtdata *, enum qib_knx_ctxtinfo_type, + struct file *); ++int qib_knx_sdma_queue_create(struct file *); ++void qib_knx_sdma_queue_destroy(struct qib_filedata *); + #endif /* _QIB_KNX_H */ +diff --git a/drivers/infiniband/hw/qib/qib_knx_common.h b/drivers/infiniband/hw/qib/qib_knx_common.h +new file mode 100644 +index 0000000..9639592 +--- /dev/null ++++ b/drivers/infiniband/hw/qib/qib_knx_common.h +@@ -0,0 +1,126 @@ ++/* ++ * Copyright (c) 2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. 
You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef _QIB_KNX_COMMON_H ++#define _QIB_KNX_COMMON_H ++ ++struct qib_device_info { ++ u16 unit; ++}; ++ ++#define QIB_SDMA_MAX_NPAGES 33 ++#define QIB_KNX_SDMA_VALUE(fld) (volatile u64)fld ++#define QIB_KNX_SDMA_SET(fld, val) \ ++ do { \ ++ fld = (u64)(val); \ ++ smp_mb(); \ ++ } while (0) ++ ++struct qib_knx_host_mem { ++ off_t flags_offset; ++ unsigned desc_num; ++}; ++ ++struct qib_knx_knc_mem { ++ off_t flags_offset; ++ off_t queue_offset; ++ size_t queue_len; ++}; ++ ++struct qib_tid_sm { ++ __u16 tid; ++ __u16 offset; ++ __u16 length; ++}; ++ ++/* ++ * SDMA transfer descriptor. This structure communicates the SDMA ++ * transfers from the MIC to the host. It is very important for ++ * performance reasons that its size is multiple of 64B in order ++ * to guarantee proper alignment in the descriptor array. ++ */ ++struct qib_knx_sdma_desc { ++ u16 ctxt; ++ u16 subctxt; ++ u32 pbclen; ++ __le32 pbc[16]; ++ u64 length; ++ u32 npages; ++ unsigned tidlen; ++ off_t offset; ++ unsigned long pages[QIB_SDMA_MAX_NPAGES]; ++ /* This array is 198B so the compiler will pad ++ * it by 2B to make it multiple of 8B. */ ++ struct qib_tid_sm tidsm[QIB_SDMA_MAX_NPAGES]; ++ /* ++ * The two paddings below are included in order to ++ * make the size of the entire struct 576B (multiple ++ * of 64B). The goal is that all elements in an array ++ * of struct qib_knx_sdma_desc are 64B aligned. ++ */ ++ u16 __padding0; ++ u64 __padding1[2]; ++}; ++ ++/* ++ * trigger, status, and complete fields are by 8 to be ++ * cacheline size. ++ */ ++struct qib_knx_sdma_hflags { ++ u64 trigger; ++ u64 __padding[7]; ++}; ++ ++#define sdma_next(s, fld) \ ++ (s)->fld = (((s)->fld + 1) == (s)->desc_num) ? 0 : ((s)->fld + 1) ++ ++struct qib_knx_sdma_mflags { ++ u64 status; ++ u64 __padding1[7]; ++ u64 complete; ++ u64 __padding2[7]; ++}; ++ ++struct qib_knx_tid_info { ++ /* this is the entire set of 512 entries (= 4K) so ++ * we can resgister. subctxt devision will be done ++ * in MIC driver. 
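++	 * In other words, the host registers the whole HW TID array once and
++	 * the MIC partitions it among contexts/subcontexts itself.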
*/ ++ off_t tidbase_offset; ++ size_t tidbase_len; ++ u64 tidbase; ++ unsigned tidcnt; ++ u64 tidtemplate; ++ unsigned long invalidtid; ++ u64 bar_addr; ++ u64 bar_len; ++}; ++ ++#endif /* _QIB_KNX_COMMON_H */ +diff --git a/drivers/infiniband/hw/qib/qib_knx_sdma.h b/drivers/infiniband/hw/qib/qib_knx_sdma.h +deleted file mode 100644 +index 8c67b1f..0000000 +--- a/drivers/infiniband/hw/qib/qib_knx_sdma.h ++++ /dev/null +@@ -1,105 +0,0 @@ +-/* +- * Copyright (c) 2013 Intel Corporation. All rights reserved. +- * +- * This software is available to you under a choice of one of two +- * licenses. You may choose to be licensed under the terms of the GNU +- * General Public License (GPL) Version 2, available from the file +- * COPYING in the main directory of this source tree, or the +- * OpenIB.org BSD license below: +- * +- * Redistribution and use in source and binary forms, with or +- * without modification, are permitted provided that the following +- * conditions are met: +- * +- * - Redistributions of source code must retain the above +- * copyright notice, this list of conditions and the following +- * disclaimer. +- * +- * - Redistributions in binary form must reproduce the above +- * copyright notice, this list of conditions and the following +- * disclaimer in the documentation and/or other materials +- * provided with the distribution. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +-#ifndef _QIB_KNX_SDMA_H +-#define _QIB_KNX_SDMA_H +- +-#define QIB_SDMA_MAX_NPAGES 33 +-#define QIB_KNX_SDMA_VALUE(fld) (volatile u64)fld +-#define QIB_KNX_SDMA_SET(fld, val) \ +- do { \ +- fld = (u64)(val); \ +- smp_mb(); \ +- } while (0) +- +-struct qib_knx_host_mem { +- off_t flags_offset; +- unsigned desc_num; +-}; +- +-struct qib_knx_knc_mem { +- off_t flags_offset; +- off_t queue_offset; +- size_t queue_len; +-}; +- +-struct qib_tid_sm { +- __u16 tid; +- __u16 offset; +- __u16 length; +-}; +- +-/* +- * SDMA transfer descriptor. This structure communicates the SDMA +- * transfers from the MIC to the host. It is very important for +- * performance reasons that its size is multiple of 64B in order +- * to guarantee proper alignment in the descriptor array. +- */ +-struct qib_knx_sdma_desc { +- u16 ctxt; +- u16 subctxt; +- u32 pbclen; +- __le32 pbc[16]; +- u64 length; +- u32 npages; +- unsigned tidlen; +- off_t offset; +- unsigned long pages[QIB_SDMA_MAX_NPAGES]; +- /* This array is 198B so the compiler will pad +- * it by 2B to make it multiple of 8B. */ +- struct qib_tid_sm tidsm[QIB_SDMA_MAX_NPAGES]; +- /* +- * The two paddings below are included in order to +- * make the size of the entire struct 576B (multiple +- * of 64B). The goal is that all elements in an array +- * of struct qib_knx_sdma_desc are 64B aligned. +- */ +- u16 __padding0; +- u64 __padding1[2]; +-}; +- +-/* +- * trigger, status, and complete fields are by 8 to be +- * cacheline size. 
+- */ +-struct qib_knx_sdma_hflags { +- u64 trigger; +- u64 __padding[7]; +-}; +- +-struct qib_knx_sdma_mflags { +- u64 status; +- u64 __padding1[7]; +- u64 complete; +- u64 __padding2[7]; +-}; +- +-#endif /* _QIB_KNX_SDMA_H */ +diff --git a/drivers/infiniband/hw/qib/qib_knx_tidrcv.h b/drivers/infiniband/hw/qib/qib_knx_tidrcv.h +deleted file mode 100644 +index 842fca1..0000000 +--- a/drivers/infiniband/hw/qib/qib_knx_tidrcv.h ++++ /dev/null +@@ -1,48 +0,0 @@ +-/* +- * Copyright (c) 2013 Intel Corporation. All rights reserved. +- * +- * This software is available to you under a choice of one of two +- * licenses. You may choose to be licensed under the terms of the GNU +- * General Public License (GPL) Version 2, available from the file +- * COPYING in the main directory of this source tree, or the +- * OpenIB.org BSD license below: +- * +- * Redistribution and use in source and binary forms, with or +- * without modification, are permitted provided that the following +- * conditions are met: +- * +- * - Redistributions of source code must retain the above +- * copyright notice, this list of conditions and the following +- * disclaimer. +- * +- * - Redistributions in binary form must reproduce the above +- * copyright notice, this list of conditions and the following +- * disclaimer in the documentation and/or other materials +- * provided with the distribution. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +-#ifndef _QIB_KNX_TIDRCV_H +- +-struct qib_knx_tid_info { +- /* this is the entire set of 512 entries (= 4K) so +- * we can resgister. subctxt devision will be done +- * in MIC driver. */ +- off_t tidbase_offset; +- size_t tidbase_len; +- u64 tidbase; +- unsigned tidcnt; +- u64 tidtemplate; +- unsigned long invalidtid; +- u64 bar_addr; +- u64 bar_len; +-}; +- +-#endif /* QIB_KNX_TIDRCV_H */ +diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c +index ccb1191..4b46f6c 100644 +--- a/drivers/infiniband/hw/qib/qib_mad.c ++++ b/drivers/infiniband/hw/qib/qib_mad.c +@@ -536,7 +536,8 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, + pip->vl_arb_low_cap = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_LOW_CAP); + /* InitTypeReply = 0 */ +- pip->inittypereply_mtucap = qib_ibmtu ? 
qib_ibmtu : IB_MTU_4096; ++ pip->inittypereply_mtucap = ++ QIB_MODPARAM_GET(ibmtu, dd->unit, ppd->port); + /* HCAs ignore VLStallCount and HOQLife */ + /* pip->vlstallcnt_hoqlife; */ + pip->operationalvl_pei_peo_fpi_fpo = +diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c +index 3f14009..d7eebfb 100644 +--- a/drivers/infiniband/hw/qib/qib_pcie.c ++++ b/drivers/infiniband/hw/qib/qib_pcie.c +@@ -501,9 +501,8 @@ static int val2fld(int wd, int mask) + return wd; + } + +-static int qib_pcie_coalesce; +-module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO); +-MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets"); ++static QIB_MODPARAM_UNIT(pcie_coalesce, NULL, 0, S_IRUGO, ++ "tune PCIe colescing on some Intel chipsets"); + + /* + * Enable PCIe completion and data coalescing, on Intel 5x00 and 7300 +@@ -518,7 +517,7 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd) + u16 devid; + u32 mask, bits, val; + +- if (!qib_pcie_coalesce) ++ if (!QIB_MODPARAM_GET(pcie_coalesce, dd->unit, 0)) + return 0; + + /* Find out supported and configured values for parent (root) */ +@@ -576,9 +575,8 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd) + * BIOS may not set PCIe bus-utilization parameters for best performance. + * Check and optionally adjust them to maximize our throughput. + */ +-static int qib_pcie_caps; +-module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO); +-MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); ++static QIB_MODPARAM_UNIT(pcie_caps, NULL, 0, S_IRUGO, ++ "Max PCIe tuning: Payload (4lsb), ReadReq (D4..7)"); + + static int qib_tune_pcie_caps(struct qib_devdata *dd) + { +@@ -587,6 +585,7 @@ static int qib_tune_pcie_caps(struct qib_devdata *dd) + u16 pcaps, pctl, ecaps, ectl; + int rc_sup, ep_sup; + int rc_cur, ep_cur; ++ int caps = QIB_MODPARAM_GET(pcie_caps, dd->unit, 0); + + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; +@@ -614,8 +613,8 @@ static int qib_tune_pcie_caps(struct qib_devdata *dd) + ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_PAYLOAD); + + /* If Supported greater than limit in module param, limit it */ +- if (rc_sup > (qib_pcie_caps & 7)) +- rc_sup = qib_pcie_caps & 7; ++ if (rc_sup > (caps & 7)) ++ rc_sup = caps & 7; + /* If less than (allowed, supported), bump root payload */ + if (rc_sup > rc_cur) { + rc_cur = rc_sup; +@@ -637,8 +636,8 @@ static int qib_tune_pcie_caps(struct qib_devdata *dd) + * which is code '5' (log2(4096) - 7) + */ + rc_sup = 5; +- if (rc_sup > ((qib_pcie_caps >> 4) & 7)) +- rc_sup = (qib_pcie_caps >> 4) & 7; ++ if (rc_sup > ((caps >> 4) & 7)) ++ rc_sup = (caps >> 4) & 7; + rc_cur = fld2val(pctl, PCI_EXP_DEVCTL_READRQ); + ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_READRQ); + +diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c +index 3cca55b..4208b20 100644 +--- a/drivers/infiniband/hw/qib/qib_qp.c ++++ b/drivers/infiniband/hw/qib/qib_qp.c +@@ -124,6 +124,7 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, + enum ib_qp_type type, u8 port) + { + u32 i, offset, max_scan, qpn; ++ unsigned krcvqs; + struct qpn_map *map; + u32 ret; + +@@ -141,10 +142,11 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, + goto bail; + } + ++ krcvqs = dd->pport[port-1].n_krcv_queues; + qpn = qpt->last + 2; + if (qpn >= QPN_MAX) + qpn = 2; +- if (qpt->mask && ((qpn & qpt->mask) >> 1) >= dd->n_krcv_queues) ++ if (qpt->mask && 
((qpn & qpt->mask) >> 1) >= krcvqs) + qpn = (qpn | qpt->mask) + 2; + offset = qpn & BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / BITS_PER_PAGE]; +@@ -162,7 +164,7 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, + goto bail; + } + offset = find_next_offset(qpt, map, offset, +- dd->n_krcv_queues); ++ krcvqs); + qpn = mk_qpn(qpt, map, offset); + /* + * This test differs from alloc_pidmap(). +diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c +index c6d6a54..1e08943 100644 +--- a/drivers/infiniband/hw/qib/qib_sdma.c ++++ b/drivers/infiniband/hw/qib/qib_sdma.c +@@ -532,7 +532,8 @@ static void complete_sdma_err_req(struct qib_pportdata *ppd, + */ + int qib_sdma_verbs_send(struct qib_pportdata *ppd, + struct qib_sge_state *ss, u32 dwords, +- struct qib_verbs_txreq *tx) ++ struct qib_verbs_txreq *tx, ++ struct snoop_packet *packet) + { + unsigned long flags; + struct qib_sge *sge; +@@ -543,6 +544,10 @@ int qib_sdma_verbs_send(struct qib_pportdata *ppd, + u64 sdmadesc[2]; + u32 dwoffset; + dma_addr_t addr; ++ u8 *packet_data = NULL; ++ ++ if (packet) ++ packet_data = packet->data + ((tx->hdr_dwords-2) << 2); + + spin_lock_irqsave(&ppd->sdma_lock, flags); + +@@ -599,6 +604,10 @@ retry: + dw << 2, DMA_TO_DEVICE); + if (dma_mapping_error(&ppd->dd->pcidev->dev, addr)) + goto unmap; ++ if (packet) { ++ memcpy(packet_data, sge->vaddr, len); ++ packet_data += len; ++ } + sdmadesc[0] = 0; + make_sdma_desc(ppd, sdmadesc, (u64) addr, dw, dwoffset); + /* SDmaUseLargeBuf has to be set in every descriptor */ +diff --git a/drivers/infiniband/hw/qib/qib_snoop.c b/drivers/infiniband/hw/qib/qib_snoop.c +new file mode 100644 +index 0000000..3c62bbb +--- /dev/null ++++ b/drivers/infiniband/hw/qib/qib_snoop.c +@@ -0,0 +1,970 @@ ++/* ++ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. ++ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++/* ++ * This file implements a raw read/raw write interface for snooping raw ++ * packets from the wire and injecting raw packets to the wire. 
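++ * Each port exposes a snoop device, which can consume the packets it
++ * queues (see qib_snoop_rcv_queue_packet()), and a capture device, which
++ * only receives copies of the traffic.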
++ * ++ * Other things that this interface could do at somepoint are: ++ * - Allow packets to be injected back into the stack ++ * - Provide an intercept for packets coming from the upper layers to ++ * move them back into user-space. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include /* for ioctl constants */ ++#include ++ ++ ++#include "qib.h" ++#include "qib_verbs.h" ++#include "qib_common.h" ++#include ++ ++#define QIB_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC ++#define QIB_SNOOP_IOC_BASE_SEQ 0x80 ++/* This starts our ioctl sequence ++ * numbers *way* off from the ones ++ * defined in ib_core ++ */ ++#define QIB_SNOOP_IOCGETLINKSTATE \ ++ _IO(QIB_SNOOP_IOC_MAGIC, QIB_SNOOP_IOC_BASE_SEQ) ++#define QIB_SNOOP_IOCSETLINKSTATE \ ++ _IO(QIB_SNOOP_IOC_MAGIC, QIB_SNOOP_IOC_BASE_SEQ+1) ++#define QIB_SNOOP_IOCCLEARQUEUE \ ++ _IO(QIB_SNOOP_IOC_MAGIC, QIB_SNOOP_IOC_BASE_SEQ+2) ++#define QIB_SNOOP_IOCCLEARFILTER \ ++ _IO(QIB_SNOOP_IOC_MAGIC, QIB_SNOOP_IOC_BASE_SEQ+3) ++#define QIB_SNOOP_IOCSETFILTER \ ++ _IO(QIB_SNOOP_IOC_MAGIC, QIB_SNOOP_IOC_BASE_SEQ+4) ++ ++/* local prototypes */ ++static int qib_snoop_open(struct inode *in, struct file *fp); ++static unsigned int qib_snoop_poll(struct file *fp, ++ struct poll_table_struct *wait); ++static ssize_t qib_snoop_read(struct file *fp, char __user *data, ++ size_t pkt_len, loff_t *off); ++static int qib_snoop_release(struct inode *in, struct file *fp); ++ ++static long qib_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); ++ ++static ssize_t qib_snoop_write(struct file *fp, const char __user *data, ++ size_t pkt_len, loff_t *off); ++ ++#include ++ ++struct qib_packet_filter_command { ++ int opcode; ++ int length; ++ void *value_ptr; ++}; ++ ++enum qib_packet_filter_opcodes { ++ FILTER_BY_LID, ++ FILTER_BY_DLID, ++ FILTER_BY_MAD_MGMT_CLASS, ++ FILTER_BY_QP_NUMBER, ++ FILTER_BY_PKT_TYPE, ++ FILTER_BY_SERVICE_LEVEL, ++ FILTER_BY_PKEY ++}; ++ ++static const struct file_operations snoop_file_ops = { ++ .owner = THIS_MODULE, ++ .open = qib_snoop_open, ++ .read = qib_snoop_read, ++ .unlocked_ioctl = qib_ioctl, ++ .poll = qib_snoop_poll, ++ .write = qib_snoop_write, ++ .release = qib_snoop_release ++}; ++ ++struct qib_filter_array { ++ int (*filter)(void *, void *, void *); ++}; ++ ++static int qib_filter_lid(void *ibhdr, void *packet_data, void *value); ++static int qib_filter_dlid(void *ibhdr, void *packet_data, void *value); ++static int qib_filter_mad_mgmt_class(void *ibhdr, void *packet_data, ++ void *value); ++static int qib_filter_qp_number(void *ibhdr, void *packet_data, void *value); ++static int qib_filter_ibpacket_type(void *ibhdr, void *packet_data, ++ void *value); ++static int qib_filter_ib_service_level(void *ibhdr, void *packet_data, ++ void *value); ++static int qib_filter_ib_pkey(void *ibhdr, void *packet_data, void *value); ++ ++static struct qib_filter_array qib_filters[] = { ++ { qib_filter_lid }, ++ { qib_filter_dlid }, ++ { qib_filter_mad_mgmt_class }, ++ { qib_filter_qp_number }, ++ { qib_filter_ibpacket_type }, ++ { qib_filter_ib_service_level }, ++ { qib_filter_ib_pkey } ++}; ++ ++#define QIB_MAX_FILTERS ARRAY_SIZE(qib_filters) ++#define QIB_DRV_NAME "ib_qib" ++#define QIB_MAJOR 233 ++#define QIB_USER_MINOR_BASE 0 ++#define QIB_DIAG_MINOR_BASE 129 ++#define QIB_SNOOP_MINOR_BASE 160 ++#define QIB_CAPTURE_MINOR_BASE 200 ++#define QIB_NMINORS 255 ++#define PORT_BITS 2 ++#define PORT_MASK ((1U << PORT_BITS) - 1) ++#define GET_HCA(x) ((unsigned int)((x) >> PORT_BITS)) ++#define GET_PORT(x) ((unsigned int)((x) & PORT_MASK)) 
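++
++/*
++ * Character device minors encode the HCA unit and port index as
++ * ((unit << PORT_BITS) | port), offset by QIB_SNOOP_MINOR_BASE for the
++ * snoop device and by QIB_CAPTURE_MINOR_BASE for the capture device;
++ * qib_snoop_open() undoes this with GET_HCA()/GET_PORT(). For example,
++ * unit 1, port index 0 maps to snoop minor 160 + 4 = 164.
++ */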
++ ++int qib_snoop_add(struct qib_devdata *dd) ++{ ++ char name[32]; ++ int ret = 0; ++ int i; ++ int j; ++ int minor = 0; ++ ++ for (i = 0; i < dd->num_pports; i++) { ++ spin_lock_init(&dd->pport[i].snoop_write_lock); ++ for (j = 0; j < QIB_CHAR_DEVICES_PER_PORT; j++) { ++ spin_lock_init(&dd->pport[i].sc_device[j].snoop_lock); ++ INIT_LIST_HEAD( ++ &(dd->pport[i].sc_device[j].snoop_queue)); ++ init_waitqueue_head( ++ &dd->pport[i].sc_device[j].snoop_waitq); ++ ++ if (j == 0) { ++ minor = (((dd->unit << PORT_BITS) | i)) + ++ QIB_SNOOP_MINOR_BASE; ++ snprintf(name, sizeof(name), ++ "ipath_snoop_%02d_%02d", dd->unit, i+1); ++ } else { ++ minor = (((dd->unit << PORT_BITS) | i)) + ++ QIB_CAPTURE_MINOR_BASE; ++ snprintf(name, sizeof(name), ++ "ipath_capture_%02d_%02d", ++ dd->unit, i+1); ++ } ++ ++ ret = qib_cdev_init( ++ minor, name, ++ &snoop_file_ops, ++ &dd->pport[i].sc_device[j].snoop_cdev, ++ &dd->pport[i].sc_device[j].snoop_class_dev); ++ if (ret) ++ goto bail; ++ } ++ pr_info("qib%d: snoop dev for hca %02d enabled port %02d\n" ++ "qib%d: capture dev for hca %02d enabled port %02d\n", ++ dd->unit, dd->unit, i+1, dd->unit, dd->unit, i+1); ++ dd->pport[i].mode_flag = 0; ++ } ++out: ++ return ret; ++bail: ++ qib_dev_err(dd, "Couldn't create %s device: %d", name, ret); ++ i--; ++ if (i != dd->num_pports) { ++ for (; i >= 0 ; i--) { ++ for (j = 0; j < QIB_CHAR_DEVICES_PER_PORT; j++) ++ qib_cdev_cleanup( ++ &dd->pport[i]. ++ sc_device[j]. ++ snoop_cdev, ++ &dd->pport[i]. ++ sc_device[j]. ++ snoop_class_dev); ++ dd->pport[i].mode_flag = 0; ++ } ++ } ++ goto out; ++} ++ ++/* this must be called w/ dd->snoop_in_lock held */ ++static void drain_snoop_list(struct qib_aux_device *sc_device) ++{ ++ struct list_head *pos, *q; ++ struct snoop_packet *packet; ++ ++ list_for_each_safe(pos, q, &(sc_device->snoop_queue)) { ++ packet = list_entry(pos, struct snoop_packet, list); ++ list_del(pos); ++ kfree(packet); ++ } ++} ++ ++void qib_snoop_remove(struct qib_devdata *dd) ++{ ++ unsigned long flags = 0; ++ int i; ++ int j; ++ ++ for (i = 0; i < dd->num_pports; i++) { ++ dd->pport[i].mode_flag = 0; ++ for (j = 0; j < QIB_CHAR_DEVICES_PER_PORT; j++) { ++ spin_lock_irqsave(&dd->pport[i].sc_device[j].snoop_lock, ++ flags); ++ drain_snoop_list(&dd->pport[i].sc_device[j]); ++ qib_cdev_cleanup(&dd->pport[i].sc_device[j].snoop_cdev, ++ &dd->pport[i].sc_device[j].snoop_class_dev); ++ spin_unlock_irqrestore( ++ &dd->pport[i].sc_device[j].snoop_lock, ++ flags); ++ } ++ } ++} ++ ++static int qib_snoop_open(struct inode *in, struct file *fp) ++{ ++ int unit = iminor(in); ++ int devnum; ++ int portnum = 0; ++ int ret; ++ int mode_flag = 0; ++ unsigned long flags; ++ struct qib_devdata *dd; ++ ++ mutex_lock(&qib_mutex); ++ ++ if (unit >= QIB_CAPTURE_MINOR_BASE) { ++ unit -= QIB_CAPTURE_MINOR_BASE; ++ devnum = 1; ++ mode_flag = QIB_PORT_CAPTURE_MODE; ++ } else { ++ unit -= QIB_SNOOP_MINOR_BASE; ++ devnum = 0; ++ mode_flag = QIB_PORT_SNOOP_MODE; ++ } ++ ++ dd = qib_lookup(GET_HCA(unit)); ++ if (dd == NULL || !(dd->flags & QIB_PRESENT) || ++ !dd->kregbase) { ++ ret = -ENODEV; ++ goto bail; ++ } ++ portnum = GET_PORT(unit); ++ ++ spin_lock_irqsave(&dd->pport[portnum].sc_device[devnum].snoop_lock, ++ flags); ++ ++ if (dd->pport[portnum].mode_flag & mode_flag) { ++ ret = -EBUSY; ++ spin_unlock_irqrestore( ++ &dd->pport[portnum].sc_device[devnum].snoop_lock, ++ flags); ++ goto bail; ++ } ++ ++ drain_snoop_list(&dd->pport[portnum].sc_device[devnum]); ++ spin_unlock_irqrestore( ++ &dd->pport[portnum].sc_device[devnum].snoop_lock, 
flags); ++ if (devnum) ++ pr_alert("capture device for hca %02d port %02d is opened\n", ++ GET_HCA(unit), portnum+1); ++ else ++ pr_alert("snoop device for hca %02d port %02d is opened\n", ++ GET_HCA(unit), portnum+1); ++ ++ dd->pport[portnum].sc_device[devnum].pport = &dd->pport[portnum]; ++ fp->private_data = &dd->pport[portnum].sc_device[devnum]; ++ ret = 0; ++ dd->pport[portnum].mode_flag |= mode_flag; ++ ++bail: ++ mutex_unlock(&qib_mutex); ++ ++ return ret; ++} ++ ++static int qib_snoop_release(struct inode *in, struct file *fp) ++{ ++ struct qib_aux_device *sc_device = fp->private_data; ++ struct qib_pportdata *pport = sc_device->pport; ++ unsigned long flags = 0; ++ int devnum = iminor(in); ++ ++ if (devnum >= QIB_CAPTURE_MINOR_BASE) ++ devnum = 1; ++ else ++ devnum = 0; ++ ++ spin_lock_irqsave(&sc_device->snoop_lock, flags); ++ if (devnum) ++ pport->mode_flag = pport->mode_flag & (~QIB_PORT_CAPTURE_MODE); ++ else ++ pport->mode_flag = pport->mode_flag & (~QIB_PORT_SNOOP_MODE); ++ ++ drain_snoop_list(sc_device); ++ /* Clear filters before going out */ ++ pport->filter_callback = NULL; ++ kfree(pport->filter_value); ++ pport->filter_value = NULL; ++ ++ spin_unlock_irqrestore(&sc_device->snoop_lock, flags); ++ ++ if (devnum) ++ pr_alert("capture device for hca %02d port %02d is closed\n", ++ pport->dd->unit, pport->port); ++ else ++ pr_alert("snoop device for hca %02d port %02d is closed\n", ++ pport->dd->unit, pport->port); ++ ++ fp->private_data = NULL; ++ return 0; ++} ++ ++static unsigned int qib_snoop_poll(struct file *fp, ++ struct poll_table_struct *wait) ++{ ++ struct qib_aux_device *sc_device = fp->private_data; ++ int ret = 0; ++ unsigned long flags = 0; ++ ++ spin_lock_irqsave(&sc_device->snoop_lock, flags); ++ ++ poll_wait(fp, &sc_device->snoop_waitq, wait); ++ if (!list_empty(&sc_device->snoop_queue)) ++ ret |= POLLIN | POLLRDNORM; ++ ++ spin_unlock_irqrestore(&sc_device->snoop_lock, flags); ++ return ret; ++ ++} ++ ++static ssize_t qib_snoop_read(struct file *fp, char __user *data, ++ size_t pkt_len, loff_t *off) ++{ ++ struct qib_aux_device *sc_device = fp->private_data; ++ ssize_t ret = 0; ++ unsigned long flags = 0; ++ struct snoop_packet *packet = NULL; ++ ++ spin_lock_irqsave(&sc_device->snoop_lock, flags); ++ ++ while (list_empty(&sc_device->snoop_queue)) { ++ spin_unlock_irqrestore(&sc_device->snoop_lock, flags); ++ ++ if (fp->f_flags & O_NONBLOCK) ++ return -EAGAIN; ++ ++ ++ if (wait_event_interruptible(sc_device->snoop_waitq, ++ !list_empty(&sc_device->snoop_queue))) ++ return -EINTR; ++ ++ spin_lock_irqsave(&sc_device->snoop_lock, flags); ++ } ++ ++ if (!list_empty(&(sc_device->snoop_queue))) { ++ packet = list_entry(sc_device->snoop_queue.next, ++ struct snoop_packet, list); ++ list_del(&packet->list); ++ spin_unlock_irqrestore(&sc_device->snoop_lock, flags); ++ if (pkt_len >= packet->total_len) { ++ if (copy_to_user(data, packet->data, ++ packet->total_len)) ++ ret = -EFAULT; ++ else ++ ret = packet->total_len; ++ } else ++ ret = -EINVAL; ++ ++ kfree(packet); ++ } else ++ spin_unlock_irqrestore(&sc_device->snoop_lock, flags); ++ ++ return ret; ++} ++ ++static long qib_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) ++{ ++ struct qib_aux_device *sc_device = fp->private_data; ++ struct qib_pportdata *ppd = sc_device->pport; ++ struct qib_devdata *dd = ppd->dd; ++ void *filter_value = NULL; ++ long ret = 0; ++ int value = 0; ++ u8 physState = 0; ++ u8 linkState = 0; ++ u16 devState = 0; ++ unsigned long flags = 0; ++ unsigned long *argp = NULL; ++ 
struct qib_packet_filter_command filter_cmd = {0}; ++ ++ if (((_IOC_DIR(cmd) & _IOC_READ) ++ && !access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd))) ++ || ((_IOC_DIR(cmd) & _IOC_WRITE) ++ && !access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd)))) { ++ ret = -EFAULT; ++ } else if (!capable(CAP_SYS_ADMIN)) { ++ ret = -EPERM; ++ } else if (sc_device != (&ppd->sc_device[QIB_SNOOP_DEV_INDEX]) ++ && cmd != QIB_SNOOP_IOCCLEARQUEUE ++ && cmd != QIB_SNOOP_IOCCLEARFILTER ++ && cmd != QIB_SNOOP_IOCSETFILTER) { ++ /* Capture devices are allowed only 3 operations ++ * 1.Clear capture queue ++ * 2.Clear capture filter ++ * 3.Set capture filter ++ * Other are invalid. ++ */ ++ ret = -EINVAL; ++ } else { ++ switch (cmd) { ++ case QIB_SNOOP_IOCSETLINKSTATE: ++ ret = __get_user(value, (int __user *) arg); ++ if (ret != 0) ++ break; ++ ++ physState = (value >> 4) & 0xF; ++ linkState = value & 0xF; ++ ++ switch (linkState) { ++ case IB_PORT_NOP: ++ if (physState == 0) ++ break; ++ /* fall through */ ++ case IB_PORT_DOWN: ++ switch (physState) { ++ case 0: ++ if (dd->f_ibphys_portstate && ++ (dd->f_ibphys_portstate(ppd->lastibcstat) ++ & 0xF & IB_PHYSPORTSTATE_SLEEP)) ++ devState = ++ QIB_IB_LINKDOWN_SLEEP; ++ else ++ devState = ++ QIB_IB_LINKDOWN; ++ break; ++ case 1: ++ devState = QIB_IB_LINKDOWN_SLEEP; ++ break; ++ case 2: ++ devState = QIB_IB_LINKDOWN; ++ break; ++ case 3: ++ devState = QIB_IB_LINKDOWN_DISABLE; ++ break; ++ default: ++ ret = -EINVAL; ++ goto done; ++ break; ++ } ++ ret = qib_set_linkstate(ppd, devState); ++ break; ++ case IB_PORT_ARMED: ++ if (!(dd->flags & ++ (QIB_IB_LINKARM | QIB_IB_LINKACTIVE))) { ++ ret = -EINVAL; ++ break; ++ } ++ ret = qib_set_linkstate(ppd, QIB_IB_LINKARM); ++ break; ++ case IB_PORT_ACTIVE: ++ if (!(dd->flags & QIB_IB_LINKARM)) { ++ ret = -EINVAL; ++ break; ++ } ++ ret = qib_set_linkstate(ppd, QIB_IB_LINKACTIVE); ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret) ++ break; ++ /* fall through */ ++ ++ case QIB_SNOOP_IOCGETLINKSTATE: ++ value = dd->f_ibphys_portstate(ppd->lastibcstat); ++ value <<= 4; ++ value |= dd->f_iblink_state(ppd->lastibcstat); ++ ret = __put_user(value, (int __user *)arg); ++ break; ++ ++ case QIB_SNOOP_IOCCLEARQUEUE: ++ spin_lock_irqsave(&sc_device->snoop_lock, flags); ++ drain_snoop_list(sc_device); ++ spin_unlock_irqrestore(&sc_device->snoop_lock, flags); ++ break; ++ ++ case QIB_SNOOP_IOCCLEARFILTER: ++ spin_lock_irqsave(&sc_device->snoop_lock, flags); ++ if (ppd->filter_callback) { ++ /* Drain packets first */ ++ drain_snoop_list(sc_device); ++ ppd->filter_callback = NULL; ++ } ++ kfree(ppd->filter_value); ++ ppd->filter_value = NULL; ++ spin_unlock_irqrestore(&sc_device->snoop_lock, flags); ++ break; ++ ++ case QIB_SNOOP_IOCSETFILTER: ++ /* just copy command structure */ ++ argp = (unsigned long *)arg; ++ ret = copy_from_user(&filter_cmd, (u8 *)argp, ++ sizeof(filter_cmd)); ++ if (ret < 0) { ++ pr_alert("Error copying filter command\n"); ++ break; ++ } ++ if (filter_cmd.opcode >= QIB_MAX_FILTERS) { ++ pr_alert("Invalid opcode in request\n"); ++ ret = -EINVAL; ++ break; ++ } ++ filter_value = kzalloc( ++ filter_cmd.length * sizeof(u8), ++ GFP_KERNEL); ++ if (!filter_value) { ++ pr_alert("Not enough memory\n"); ++ ret = -ENOMEM; ++ break; ++ } ++ /* copy remaining data from userspace */ ++ ret = copy_from_user((u8 *)filter_value, ++ (u8 *)filter_cmd.value_ptr, ++ filter_cmd.length); ++ if (ret < 0) { ++ kfree(filter_value); ++ pr_alert("Error copying filter data\n"); ++ break; ++ } ++ /* Drain packets first 
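++			 * so packets queued under the previous filter don't get
++			 * mixed in with the newly filtered ones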
*/ ++ spin_lock_irqsave(&sc_device->snoop_lock, flags); ++ drain_snoop_list(sc_device); ++ ppd->filter_callback = ++ qib_filters[filter_cmd.opcode].filter; ++ /* just in case we see back to back sets */ ++ kfree(ppd->filter_value); ++ ppd->filter_value = filter_value; ++ spin_unlock_irqrestore(&sc_device->snoop_lock, flags); ++ break; ++ ++ default: ++ ret = -ENOTTY; ++ break; ++ } ++ } ++done: ++ return ret; ++} ++ ++ ++static ssize_t qib_pio_send_pkt(struct qib_pportdata *ppd, ++ u32 *data, u32 pkt_len) ++{ ++ int i; ++ u64 pbc; ++ u32 __iomem *piobuf; ++ u32 pnum, control, len; ++ struct qib_devdata *dd = ppd->dd; ++ u32 dwords = pkt_len >> 2; ++ unsigned long flags; ++ ssize_t ret = -EINVAL; ++ ++ i = 0; ++ len = dwords + 1; ++ control = dd->f_setpbc_control(ppd, len, 0, ++ (((u8 *)data)[0] >> 4) & 0xf); ++ pbc = ((u64) control << 32) | len; ++ while (!(piobuf = dd->f_getsendbuf(ppd, pbc, &pnum))) { ++ if (i > 15) { ++ ret = -ENOMEM; ++ goto Err; ++ } ++ i++; ++ /* lets try to flush all of it */ ++ dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_ALL); ++ udelay(100); ++ } ++ spin_lock_irqsave(&ppd->snoop_write_lock, flags); ++ /* disable header check on this packet, since it can't be valid */ ++ dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_DIS1, NULL); ++ writeq(pbc, piobuf); ++ qib_flush_wc(); ++ if (dd->flags & QIB_PIO_FLUSH_WC) { ++ qib_flush_wc(); ++ qib_pio_copy(piobuf + 2, data, dwords - 1); ++ qib_flush_wc(); ++ __raw_writel(data[dwords - 1], piobuf + dwords + 1); ++ } else ++ qib_pio_copy(piobuf + 2, data, dwords); ++ if (dd->flags & QIB_USE_SPCL_TRIG) { ++ u32 spcl_off = (pnum >= dd->piobcnt2k) ? 2047 : 1023; ++ ++ qib_flush_wc(); ++ __raw_writel(0xaebecede, piobuf + spcl_off); ++ } ++ qib_sendbuf_done(dd, pnum); ++ qib_flush_wc(); ++ /* and re-enable hdr check */ ++ dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_ENAB1, NULL); ++ spin_unlock_irqrestore(&ppd->snoop_write_lock, flags); ++ ret = pkt_len; ++Err: ++ return ret; ++} ++ ++ ++static ssize_t qib_snoop_write(struct file *fp, const char __user *data, ++ size_t pkt_len, loff_t *off) ++{ ++ struct qib_aux_device *sc_device = fp->private_data; ++ struct qib_pportdata *ppd = sc_device->pport; ++ struct qib_devdata *dd = ppd->dd; ++ ssize_t ret = 0; ++ u32 *buffer = NULL; ++ u32 plen, clen; ++ ++ /* capture device should not be entertaining writes */ ++ if (sc_device != (&ppd->sc_device[QIB_SNOOP_DEV_INDEX])) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ ++ if (pkt_len == 0) ++ goto bail; ++ ++ if (pkt_len & 3) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ ++ clen = pkt_len >> 2; ++ ++ if (!dd || !(dd->flags & QIB_PRESENT) || ++ !dd->kregbase) { ++ ret = -ENODEV; ++ goto bail; ++ } ++ ++ if (!(dd->flags & QIB_INITTED)) { ++ /* no hardware, freeze, etc. 
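++		 * -- the chip cannot accept PIO sends in that state, so fail
++		 * the write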
*/ ++ ret = -ENODEV; ++ goto bail; ++ } ++ ++ plen = sizeof(u32) + pkt_len; ++ ++ if ((plen + 4) > ppd->ibmaxlen) { ++ ret = -EINVAL; ++ goto bail; /* before writing pbc */ ++ } ++ ++ buffer = vmalloc(plen); ++ if (!buffer) { ++ ret = -ENOMEM; ++ goto bail; ++ } ++ if (copy_from_user(buffer, ++ (const void __user *) (unsigned long) data, pkt_len)) { ++ ret = -EFAULT; ++ goto bail; ++ } ++ ++ ret = qib_pio_send_pkt(ppd, buffer, pkt_len); ++ ++bail: ++ vfree(buffer); ++ ++ return ret; ++} ++ ++int snoop_get_header_size(struct qib_devdata *dd, ++ struct qib_ib_header *hdr, ++ void *data, u32 tlen) ++{ ++ int lnh, header_size = -1; ++ u8 opcode, opcode_major; ++ struct qib_other_headers *ohdr; ++ ++ lnh = (be16_to_cpu(hdr->lrh[0]) & 3); ++ ++ if (lnh == QIB_LRH_BTH) ++ ohdr = &hdr->u.oth; ++ else if (lnh == QIB_LRH_GRH) ++ ohdr = &hdr->u.l.oth; ++ else ++ goto bail; ++ ++ opcode = be32_to_cpu(ohdr->bth[0]) >> 24; ++ ++ opcode_major = (opcode >> 5) & 0x7; ++ ++ switch (opcode_major) { ++ case 0x03: /* UD */ ++ if (lnh == QIB_LRH_BTH) ++ header_size = 8 + 12 + 8 /* LRH + BTH + DETH */; ++ else if (lnh == QIB_LRH_GRH) { ++ ++ /* LRH + GRH + BTH + DETH */; ++ header_size = 8 + 40 + 12 + 8; ++ /* Some of the header data is in the data segment */ ++ if (dd->rcvhdrentsize == 16) ++ header_size -= 12; ++ } else ++ header_size = -1; ++ ++ break; ++ case 0x0: /* RC */ ++ case 0x1: /* UC */ ++ case 0x2: /* RD */ ++ default: ++ header_size = -1; ++ break; ++ } ++ ++bail: ++ return header_size; ++} ++ ++static void qib_snoop_list_add_tail(struct snoop_packet *packet, ++ struct qib_pportdata *ppd, ++ int dev_index) ++{ ++ unsigned long flags = 0; ++ struct qib_aux_device *sc_device = &ppd->sc_device[dev_index]; ++ ++ spin_lock_irqsave(&sc_device->snoop_lock, flags); ++ if (likely((dev_index == QIB_CAPTURE_DEV_INDEX && ++ (ppd->mode_flag & QIB_PORT_CAPTURE_MODE)) || ++ (dev_index == QIB_SNOOP_DEV_INDEX && ++ (ppd->mode_flag & QIB_PORT_SNOOP_MODE)))) ++ list_add_tail(&(packet->list), &sc_device->snoop_queue); ++ spin_unlock_irqrestore(&sc_device->snoop_lock, flags); ++ wake_up_interruptible(&sc_device->snoop_waitq); ++} ++ ++void qib_snoop_send_queue_packet(struct qib_pportdata *ppd, ++ struct snoop_packet *packet) ++{ ++ /* If we are dealing with mix mode then we need to make another copy ++ * of same packet and queue it in snoop device as well. ++ * However if we do not get sufficient memory here then we just ++ * add packet to capture queue by default so that we atleast have one ++ * packet with us in capture queue. ++ */ ++ if (unlikely(ppd->mode_flag == ++ (QIB_PORT_SNOOP_MODE | QIB_PORT_CAPTURE_MODE))) { ++ struct snoop_packet *pcopy; ++ pcopy = kmalloc(sizeof(*pcopy) + packet->total_len, GFP_ATOMIC); ++ if (pcopy != NULL) { ++ memcpy(pcopy, packet, ++ packet->total_len + sizeof(*pcopy)); ++ qib_snoop_list_add_tail(pcopy, ppd, ++ QIB_SNOOP_DEV_INDEX); ++ } ++ qib_snoop_list_add_tail(packet, ppd, QIB_CAPTURE_DEV_INDEX); ++ } else if (ppd->mode_flag == QIB_PORT_CAPTURE_MODE) ++ qib_snoop_list_add_tail(packet, ppd, QIB_CAPTURE_DEV_INDEX); ++ else if (ppd->mode_flag == QIB_PORT_SNOOP_MODE) ++ qib_snoop_list_add_tail(packet, ppd, QIB_SNOOP_DEV_INDEX); ++} ++ ++/* ++ * qib_snoop_rcv_queue_packet - receive a packet for snoop interface ++ * @port - Hca port on which this packet is received. ++ * @rhdr - Packet header ++ * @data - Packet data/payloaa ++ * @tlen - total length of packet including header and payload. 
++ * ++ * Called on for every packet received when snooping/mix mode is turned on ++ * Copies received packet to internal buffer and appends it to ++ * packet list. ++ * ++ * Returns, ++ * 0 if this packet needs to be forwarded by driver ++ * 1 if this packet needs to be dropped by driver ++ */ ++ ++int qib_snoop_rcv_queue_packet(struct qib_pportdata *port, void *rhdr, ++ void *data, u32 tlen) ++{ ++ int header_size = 0; ++ struct qib_ib_header *hdr = rhdr; ++ struct snoop_packet *packet = NULL; ++ ++ header_size = snoop_get_header_size(port->dd, hdr, data, tlen); ++ if (header_size <= 0) ++ return 0; ++ ++ /* qib_snoop_send_queue_packet takes care or mix mode, ++ * so just return from here. ++ */ ++ if (port->mode_flag == (QIB_PORT_SNOOP_MODE | QIB_PORT_CAPTURE_MODE)) ++ return 0; ++ ++ packet = kmalloc(sizeof(struct snoop_packet) + tlen, ++ GFP_ATOMIC); ++ if (likely(packet)) { ++ memcpy(packet->data, rhdr, header_size); ++ memcpy(packet->data + header_size, data, ++ tlen - header_size); ++ packet->total_len = tlen; ++ qib_snoop_list_add_tail(packet, port, QIB_SNOOP_DEV_INDEX); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static int qib_filter_lid(void *ibhdr, void *packet_data, void *value) ++{ ++ struct qib_ib_header *hdr = (struct qib_ib_header *)ibhdr; ++ if (*((u16 *)value) == be16_to_cpu(hdr->lrh[3])) ++ return 0; /* matched */ ++ return 1; /* Not matched */ ++} ++ ++static int qib_filter_dlid(void *ibhdr, void *packet_data, void *value) ++{ ++ struct qib_ib_header *hdr = (struct qib_ib_header *)ibhdr; ++ if (*((u16 *)value) == be16_to_cpu(hdr->lrh[1])) ++ return 0; ++ return 1; ++} ++ ++static int qib_filter_mad_mgmt_class(void *ibhdr, void *packet_data, ++ void *value) ++{ ++ struct qib_ib_header *hdr = (struct qib_ib_header *)ibhdr; ++ struct qib_other_headers *ohdr = NULL; ++ struct ib_smp *smp = NULL; ++ u32 qpn = 0; ++ ++ /* packet_data could be null if only header is captured */ ++ if (packet_data == NULL) ++ return 1; ++ /* Check for GRH */ ++ if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH) ++ ohdr = &hdr->u.oth; /* LRH + BTH + DETH */ ++ else ++ ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */ ++ qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF; ++ if (qpn <= 1) { ++ smp = (struct ib_smp *)packet_data; ++ if (*((u8 *)value) == smp->mgmt_class) ++ return 0; ++ else ++ return 1; ++ } ++ return 1; ++} ++ ++static int qib_filter_qp_number(void *ibhdr, void *packet_data, void *value) ++{ ++ ++ struct qib_ib_header *hdr = (struct qib_ib_header *)ibhdr; ++ struct qib_other_headers *ohdr = NULL; ++ ++ /* Check for GRH */ ++ if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH) ++ ohdr = &hdr->u.oth; /* LRH + BTH + DETH */ ++ else ++ ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */ ++ if (*((u32 *)value) == (be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF)) ++ return 0; ++ return 1; ++} ++ ++ ++static int qib_filter_ibpacket_type(void *ibhdr, void *packet_data, ++ void *value) ++{ ++ u32 lnh = 0; ++ u8 opcode = 0; ++ struct qib_ib_header *hdr = (struct qib_ib_header *)ibhdr; ++ struct qib_other_headers *ohdr = NULL; ++ ++ lnh = (be16_to_cpu(hdr->lrh[0]) & 3); ++ ++ if (lnh == QIB_LRH_BTH) ++ ohdr = &hdr->u.oth; ++ else if (lnh == QIB_LRH_GRH) ++ ohdr = &hdr->u.l.oth; ++ else ++ return 1; ++ ++ opcode = be32_to_cpu(ohdr->bth[0]) >> 24; ++ ++ if (*((u8 *)value) == ((opcode >> 5) & 0x7)) ++ return 0; ++ return 1; ++} ++ ++static int qib_filter_ib_service_level(void *ibhdr, void *packet_data, ++ void *value) ++{ ++ struct qib_ib_header *hdr = (struct qib_ib_header *)ibhdr; ++ ++ if ((*((u8 
*)value)) == (be16_to_cpu(hdr->lrh[0] >> 4) & 0xF)) ++ return 0; ++ return 1; ++} ++ ++static int qib_filter_ib_pkey(void *ibhdr, void *packet_data, void *value) ++{ ++ ++ u32 lnh = 0; ++ struct qib_ib_header *hdr = (struct qib_ib_header *)ibhdr; ++ struct qib_other_headers *ohdr = NULL; ++ ++ lnh = (be16_to_cpu(hdr->lrh[0]) & 3); ++ if (lnh == QIB_LRH_BTH) ++ ohdr = &hdr->u.oth; ++ else if (lnh == QIB_LRH_GRH) ++ ohdr = &hdr->u.l.oth; ++ else ++ return 1; ++ ++ /* P_key is 16-bit entity, however top most bit indicates ++ * type of membership. 0 for limited and 1 for Full. ++ * Limited members cannot accept information from other ++ * Limited members, but communication is allowed between ++ * every other combination of membership. ++ * Hence we'll omitt comparing top-most bit while filtering ++ */ ++ ++ if ((*(u16 *)value & 0x7FFF) == ++ ((be32_to_cpu(ohdr->bth[0])) & 0x7FFF)) ++ return 0; ++ return 1; ++} +diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c +index d0a0ea0..a98635d 100644 +--- a/drivers/infiniband/hw/qib/qib_user_sdma.c ++++ b/drivers/infiniband/hw/qib/qib_user_sdma.c +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two +@@ -52,83 +53,65 @@ + /* attempt to drain the queue for 5secs */ + #define QIB_USER_SDMA_DRAIN_TIMEOUT 500 + +-struct qib_user_sdma_pkt { +- struct list_head list; /* list element */ +- +- u8 tiddma; /* if this is NEW tid-sdma */ +- u8 largepkt; /* this is large pkt from kmalloc */ +- u16 frag_size; /* frag size used by PSM */ +- u16 index; /* last header index or push index */ +- u16 naddr; /* dimension of addr (1..3) ... */ +- u16 addrlimit; /* addr array size */ +- u16 tidsmidx; /* current tidsm index */ +- u16 tidsmcount; /* tidsm array item count */ +- u16 payload_size; /* payload size so far for header */ +- u32 bytes_togo; /* bytes for processing */ +- u32 counter; /* sdma pkts queued counter for this entry */ +- struct qib_tid_session_member *tidsm; /* tid session member array */ +- struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */ +- u64 added; /* global descq number of entries */ +- +- struct { +- u16 offset; /* offset for kvaddr, addr */ +- u16 length; /* length in page */ +- u16 first_desc; /* first desc */ +- u16 last_desc; /* last desc */ +- u16 put_page; /* should we put_page? */ +- u16 dma_mapped; /* is page dma_mapped? */ +- u16 dma_length; /* for dma_unmap_page() */ +- u16 padding; +- struct page *page; /* may be NULL (coherent mem) */ +- void *kvaddr; /* FIXME: only for pio hack */ +- dma_addr_t addr; +- } addr[4]; /* max pages, any more and we coalesce */ ++/* ++ * track how many times a process open this driver. ++ */ ++struct rb_root qib_user_sdma_rb_root = RB_ROOT; ++ ++struct qib_user_sdma_rb_node { ++ struct rb_node node; ++ int refcount; ++ pid_t pid; + }; + +-struct qib_user_sdma_queue { +- /* +- * pkts sent to dma engine are queued on this +- * list head. the type of the elements of this +- * list are struct qib_user_sdma_pkt... 
+- */ +- struct list_head sent; ++static struct qib_user_sdma_rb_node * ++qib_user_sdma_rb_search(struct rb_root *root, pid_t pid) ++{ ++ struct qib_user_sdma_rb_node *sdma_rb_node; ++ struct rb_node *node = root->rb_node; ++ ++ while (node) { ++ sdma_rb_node = container_of(node, ++ struct qib_user_sdma_rb_node, node); ++ if (pid < sdma_rb_node->pid) ++ node = node->rb_left; ++ else if (pid > sdma_rb_node->pid) ++ node = node->rb_right; ++ else ++ return sdma_rb_node; ++ } ++ return NULL; ++} + +- /* +- * Because above list will be accessed by both process and +- * signal handler, we need a spinlock for it. +- */ +- spinlock_t sent_lock ____cacheline_aligned_in_smp; +- +- /* headers with expected length are allocated from here... */ +- char header_cache_name[64]; +- struct dma_pool *header_cache; +- +- /* packets are allocated from the slab cache... */ +- char pkt_slab_name[64]; +- struct kmem_cache *pkt_slab; +- +- /* as packets go on the queued queue, they are counted... */ +- u32 counter; +- u32 sent_counter; +- /* pending packets, not sending yet */ +- u32 num_pending; +- /* sending packets, not complete yet */ +- u32 num_sending; +- /* global descq number of entry of last sending packet */ +- u64 added; +- +- /* dma page table */ +- struct rb_root dma_pages_root; +- +- /* protect everything above... */ +- struct mutex lock; +-}; ++static int ++qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new) ++{ ++ struct rb_node **node = &(root->rb_node); ++ struct rb_node *parent = NULL; ++ struct qib_user_sdma_rb_node *got; ++ ++ while (*node) { ++ got = container_of(*node, struct qib_user_sdma_rb_node, node); ++ parent = *node; ++ if (new->pid < got->pid) ++ node = &((*node)->rb_left); ++ else if (new->pid > got->pid) ++ node = &((*node)->rb_right); ++ else ++ return 0; ++ } ++ ++ rb_link_node(&new->node, parent, node); ++ rb_insert_color(&new->node, root); ++ return 1; ++} + + struct qib_user_sdma_queue * + qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) + { + struct qib_user_sdma_queue *pq = + kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL); ++ struct qib_user_sdma_rb_node *sdma_rb_node; + + if (!pq) + goto done; +@@ -138,6 +121,7 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) + pq->num_pending = 0; + pq->num_sending = 0; + pq->added = 0; ++ pq->sdma_rb_node = NULL; + + INIT_LIST_HEAD(&pq->sent); + spin_lock_init(&pq->sent_lock); +@@ -163,8 +147,30 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) + + pq->dma_pages_root = RB_ROOT; + ++ sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root, ++ current->pid); ++ if (sdma_rb_node) { ++ sdma_rb_node->refcount++; ++ } else { ++ int ret; ++ sdma_rb_node = kmalloc(sizeof( ++ struct qib_user_sdma_rb_node), GFP_KERNEL); ++ if (!sdma_rb_node) ++ goto err_rb; ++ ++ sdma_rb_node->refcount = 1; ++ sdma_rb_node->pid = current->pid; ++ ++ ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, ++ sdma_rb_node); ++ BUG_ON(ret == 0); ++ } ++ pq->sdma_rb_node = sdma_rb_node; ++ + goto done; + ++err_rb: ++ dma_pool_destroy(pq->header_cache); + err_slab: + kmem_cache_destroy(pq->pkt_slab); + err_kfree: +@@ -175,12 +181,12 @@ done: + return pq; + } + +-static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, +- int i, u16 offset, u16 len, +- u16 first_desc, u16 last_desc, +- u16 put_page, u16 dma_mapped, +- struct page *page, void *kvaddr, +- dma_addr_t dma_addr, u16 dma_length) ++void qib_user_sdma_init_frag(struct 
qib_user_sdma_pkt *pkt, ++ int i, u16 offset, u16 len, ++ u16 first_desc, u16 last_desc, ++ u16 put_page, u16 dma_mapped, ++ struct page *page, void *kvaddr, ++ dma_addr_t dma_addr, u16 dma_length) + { + pkt->addr[i].offset = offset; + pkt->addr[i].length = len; +@@ -194,7 +200,7 @@ static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, + pkt->addr[i].dma_length = dma_length; + } + +-static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, ++void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, + size_t len, dma_addr_t *dma_addr) + { + void *hdr; +@@ -216,11 +222,11 @@ static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, + return hdr; + } + +-static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, +- struct qib_user_sdma_queue *pq, +- struct qib_user_sdma_pkt *pkt, +- struct page *page, u16 put, +- u16 offset, u16 len, void *kvaddr) ++int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, ++ struct qib_user_sdma_queue *pq, ++ struct qib_user_sdma_pkt *pkt, ++ struct page *page, u16 put, ++ u16 offset, u16 len, void *kvaddr) + { + __le16 *pbc16; + void *pbcvaddr; +@@ -235,21 +241,27 @@ static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, + int ret = 0; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { +- /* +- * dma mapping error, pkt has not managed +- * this page yet, return the page here so +- * the caller can ignore this page. +- */ +- if (put) { +- put_page(page); +- } else { +- /* coalesce case */ +- kunmap(page); +- __free_page(page); ++#ifdef QIB_CONFIG_KNX ++ if (!pkt->remote) { ++#endif ++ /* ++ * dma mapping error, pkt has not managed ++ * this page yet, return the page here so ++ * the caller can ignore this page. ++ */ ++ if (put) { ++ put_page(page); ++ } else { ++ /* coalesce case */ ++ kunmap(page); ++ __free_page(page); ++ } ++ ret = -ENOMEM; ++ goto done; + } +- ret = -ENOMEM; +- goto done; ++#ifdef QIB_CONFIG_KNX + } ++#endif + offset = 0; + dma_mapped = 1; + +@@ -551,13 +563,19 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev, + pkt->addr[i].dma_length, + DMA_TO_DEVICE); + +- if (pkt->addr[i].kvaddr) +- kunmap(pkt->addr[i].page); ++#ifdef QIB_CONFIG_KNX ++ if (!pkt->remote) { ++#endif ++ if (pkt->addr[i].kvaddr) ++ kunmap(pkt->addr[i].page); + +- if (pkt->addr[i].put_page) +- put_page(pkt->addr[i].page); +- else +- __free_page(pkt->addr[i].page); ++ if (pkt->addr[i].put_page) ++ put_page(pkt->addr[i].page); ++ else ++ __free_page(pkt->addr[i].page); ++#ifdef QIB_CONFIG_KNX ++ } ++#endif + } else if (pkt->addr[i].kvaddr) { + /* for headers */ + if (pkt->addr[i].dma_mapped) { +@@ -697,9 +715,9 @@ static int qib_user_sdma_init_payload(const struct qib_devdata *dd, + } + + /* free a packet list -- return counter value of last packet */ +-static void qib_user_sdma_free_pkt_list(struct device *dev, +- struct qib_user_sdma_queue *pq, +- struct list_head *list) ++void qib_user_sdma_free_pkt_list(struct device *dev, ++ struct qib_user_sdma_queue *pq, ++ struct list_head *list) + { + struct qib_user_sdma_pkt *pkt, *pkt_next; + +@@ -709,6 +727,10 @@ static void qib_user_sdma_free_pkt_list(struct device *dev, + for (i = 0; i < pkt->naddr; i++) + qib_user_sdma_free_pkt_frag(dev, pq, pkt, i); + ++#ifdef QIB_CONFIG_KNX ++ if (pkt->remote) ++ qib_knx_sdma_free_pkt(pkt); ++#endif + if (pkt->largepkt) + kfree(pkt); + else +@@ -892,6 +914,7 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, + pkt->payload_size = 0; + pkt->counter = counter; + pkt->tiddma = 
tiddma; ++ pkt->remote = 0; + + /* setup the first header */ + qib_user_sdma_init_frag(pkt, 0, /* index */ +@@ -967,8 +990,8 @@ static void qib_user_sdma_set_complete_counter(struct qib_user_sdma_queue *pq, + } + + /* try to clean out queue -- needs pq->lock */ +-static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, +- struct qib_user_sdma_queue *pq) ++int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, ++ struct qib_user_sdma_queue *pq) + { + struct qib_devdata *dd = ppd->dd; + struct list_head free_list; +@@ -1021,13 +1044,18 @@ void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq) + if (!pq) + return; + +- kmem_cache_destroy(pq->pkt_slab); ++ pq->sdma_rb_node->refcount--; ++ if (pq->sdma_rb_node->refcount == 0) { ++ rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root); ++ kfree(pq->sdma_rb_node); ++ } + dma_pool_destroy(pq->header_cache); ++ kmem_cache_destroy(pq->pkt_slab); + kfree(pq); + } + + /* clean descriptor queue, returns > 0 if some elements cleaned */ +-static int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd) ++int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd) + { + int ret; + unsigned long flags; +@@ -1238,30 +1266,56 @@ retry: + } + + /* pq->lock must be held, get packets on the wire... */ +-static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, +- struct qib_user_sdma_queue *pq, +- struct list_head *pktlist, int count) ++int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, ++ struct qib_user_sdma_queue *pq, ++ struct list_head *pktlist, int count) + { +- int ret = 0; + unsigned long flags; + + if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE))) + return -ECOMM; + +- spin_lock_irqsave(&ppd->sdma_lock, flags); +- +- if (unlikely(!__qib_sdma_running(ppd))) { +- ret = -ECOMM; +- goto unlock; ++ /* non-blocking mode */ ++ if (pq->sdma_rb_node->refcount > 1) { ++ spin_lock_irqsave(&ppd->sdma_lock, flags); ++ if (unlikely(!__qib_sdma_running(ppd))) { ++ spin_unlock_irqrestore(&ppd->sdma_lock, flags); ++ return -ECOMM; ++ } ++ pq->num_pending += count; ++ list_splice_tail_init(pktlist, &ppd->sdma_userpending); ++ qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); ++ spin_unlock_irqrestore(&ppd->sdma_lock, flags); ++ return 0; + } + ++ /* In this case, descriptors from this process are not ++ * linked to ppd pending queue, interrupt handler ++ * won't update this process, it is OK to directly ++ * modify without sdma lock. ++ */ ++ ++ + pq->num_pending += count; +- list_splice_tail_init(pktlist, &ppd->sdma_userpending); +- qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); ++ /* ++ * Blocking mode for single rail process, we must ++ * release/regain sdma_lock to give other process ++ * chance to make progress. This is important for ++ * performance. 
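++	 * A process counts as "single rail" when its pid has only one user
++	 * SDMA queue registered in qib_user_sdma_rb_root, i.e. the
++	 * sdma_rb_node->refcount == 1 case excluded by the check above, so
++	 * no sibling queue from the same process feeds descriptors here.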
++ */ ++ do { ++ spin_lock_irqsave(&ppd->sdma_lock, flags); ++ if (unlikely(!__qib_sdma_running(ppd))) { ++ spin_unlock_irqrestore(&ppd->sdma_lock, flags); ++ return -ECOMM; ++ } ++ qib_user_sdma_send_desc(ppd, pktlist); ++ if (!list_empty(pktlist)) ++ qib_sdma_make_progress(ppd); ++ spin_unlock_irqrestore(&ppd->sdma_lock, flags); ++ } while (!list_empty(pktlist)); + +-unlock: +- spin_unlock_irqrestore(&ppd->sdma_lock, flags); +- return ret; ++ return 0; + } + + int qib_user_sdma_writev(struct qib_ctxtdata *rcd, +@@ -1291,7 +1345,7 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd, + qib_user_sdma_queue_clean(ppd, pq); + + while (dim) { +- int mxp = 8; ++ int mxp = 1; + int ndesc = 0; + + down_write(¤t->mm->mmap_sem); +diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.h b/drivers/infiniband/hw/qib/qib_user_sdma.h +index ce8cbaf..93ce40b 100644 +--- a/drivers/infiniband/hw/qib/qib_user_sdma.h ++++ b/drivers/infiniband/hw/qib/qib_user_sdma.h +@@ -31,12 +31,108 @@ + */ + #include + +-struct qib_user_sdma_queue; ++struct qib_user_sdma_pkt { ++ struct list_head list; /* list element */ ++ ++ u8 tiddma; /* if this is NEW tid-sdma */ ++ u8 largepkt; /* this is large pkt from kmalloc */ ++ u16 frag_size; /* frag size used by PSM */ ++ u16 index; /* last header index or push index */ ++ u16 naddr; /* dimension of addr (1..3) ... */ ++ u16 addrlimit; /* addr array size */ ++ u16 tidsmidx; /* current tidsm index */ ++ u16 tidsmcount; /* tidsm array item count */ ++ u16 payload_size; /* payload size so far for header */ ++ u32 bytes_togo; /* bytes for processing */ ++ u32 counter; /* sdma pkts queued counter for this entry */ ++ struct qib_tid_session_member *tidsm; /* tid session member array */ ++ struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */ ++ u64 added; /* global descq number of entries */ ++#ifdef QIB_CONFIG_KNX ++ u64 remote; /* does the packet original on the host */ ++#endif ++ ++ struct { ++ u16 offset; /* offset for kvaddr, addr */ ++ u16 length; /* length in page */ ++ u16 first_desc; /* first desc */ ++ u16 last_desc; /* last desc */ ++ u16 put_page; /* should we put_page? */ ++ u16 dma_mapped; /* is page dma_mapped? */ ++ u16 dma_length; /* for dma_unmap_page() */ ++ u16 padding; ++ struct page *page; /* may be NULL (coherent mem) */ ++ void *kvaddr; /* FIXME: only for pio hack */ ++ dma_addr_t addr; ++ } addr[4]; /* max pages, any more and we coalesce */ ++}; ++ ++struct qib_user_sdma_queue { ++ /* ++ * pkts sent to dma engine are queued on this ++ * list head. the type of the elements of this ++ * list are struct qib_user_sdma_pkt... ++ */ ++ struct list_head sent; ++ ++ /* ++ * Because above list will be accessed by both process and ++ * signal handler, we need a spinlock for it. ++ */ ++ spinlock_t sent_lock ____cacheline_aligned_in_smp; ++ ++ /* headers with expected length are allocated from here... */ ++ char header_cache_name[64]; ++ struct dma_pool *header_cache; ++ ++ /* packets are allocated from the slab cache... */ ++ char pkt_slab_name[64]; ++ struct kmem_cache *pkt_slab; ++ ++ /* as packets go on the queued queue, they are counted... */ ++ u32 counter; ++ u32 sent_counter; ++ /* pending packets, not sending yet */ ++ u32 num_pending; ++ /* sending packets, not complete yet */ ++ u32 num_sending; ++ /* global descq number of entry of last sending packet */ ++ u64 added; ++ ++ /* dma page table */ ++ struct rb_root dma_pages_root; ++ ++ struct qib_user_sdma_rb_node *sdma_rb_node; ++ ++ /* protect everything above... 
*/ ++ struct mutex lock; ++}; + + struct qib_user_sdma_queue * + qib_user_sdma_queue_create(struct device *dev, int unit, int port, int sport); + void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq); +- ++void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, ++ size_t len, dma_addr_t *dma_addr); ++void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, ++ int i, u16 offset, u16 len, ++ u16 first_desc, u16 last_desc, ++ u16 put_page, u16 dma_mapped, ++ struct page *page, void *kvaddr, ++ dma_addr_t dma_addr, u16 dma_length); ++int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, ++ struct qib_user_sdma_queue *pq, ++ struct qib_user_sdma_pkt *pkt, ++ struct page *page, u16 put, ++ u16 offset, u16 len, void *kvaddr); ++int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd); ++int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, ++ struct qib_user_sdma_queue *pq); ++void qib_user_sdma_free_pkt_list(struct device *dev, ++ struct qib_user_sdma_queue *pq, ++ struct list_head *list); ++int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, ++ struct qib_user_sdma_queue *pq, ++ struct list_head *pktlist, int count); + int qib_user_sdma_writev(struct qib_ctxtdata *pd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, +@@ -50,3 +146,8 @@ void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, + + u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq); + u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq); ++/* ++ * This function prototype somewhat polutes this header file ++ * but I don't want to create a new header file just for it. ++ */ ++void qib_knx_sdma_free_pkt(struct qib_user_sdma_pkt *pkt); +diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c +index 092b0bb..687c216 100644 +--- a/drivers/infiniband/hw/qib/qib_verbs.c ++++ b/drivers/infiniband/hw/qib/qib_verbs.c +@@ -621,6 +621,15 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) + if (unlikely(tlen < 24)) + goto drop; + ++ if (ppd->mode_flag & QIB_PORT_SNOOP_MODE) { ++ int nomatch = 0; ++ if (ppd->filter_callback) ++ nomatch = ppd->filter_callback(hdr, data, ++ ppd->filter_value); ++ if (nomatch == 0 && ++ qib_snoop_rcv_queue_packet(ppd, rhdr, data, tlen)) ++ goto drop; ++ } + /* Check for a valid destination LID (see ch. 7.11.1). 
*/ + lid = be16_to_cpu(hdr->lrh[1]); + if (lid < QIB_MULTICAST_LID_BASE) { +@@ -789,11 +798,17 @@ static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) + #endif + + static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss, +- u32 length, unsigned flush_wc) ++ u32 length, unsigned flush_wc, struct snoop_packet *packet, ++ u8 *data_orig) + { + u32 extra = 0; + u32 data = 0; + u32 last; ++ u32 *packet_data = NULL; ++ ++ /* This ensures copying word at a time */ ++ if (packet) ++ packet_data = (u32 *)data_orig; + + while (1) { + u32 len = ss->sge.length; +@@ -825,6 +840,10 @@ static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss, + } + __raw_writel(data, piobuf); + piobuf++; ++ if (packet_data) { ++ *packet_data = data; ++ packet_data++; ++ } + extra = 0; + data = 0; + } else { +@@ -851,6 +870,10 @@ static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss, + data = get_upper_bits(v, ushift); + piobuf++; + addr++; ++ if (packet_data) { ++ *packet_data = data; ++ packet_data++; ++ } + l -= sizeof(u32); + } + /* +@@ -868,6 +891,10 @@ static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss, + } + __raw_writel(data, piobuf); + piobuf++; ++ if (packet_data) { ++ *packet_data = data; ++ packet_data++; ++ } + extra = 0; + data = 0; + } else { +@@ -894,12 +921,20 @@ static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss, + qib_pio_copy(piobuf, ss->sge.vaddr, w - 1); + piobuf += w - 1; + last = ((u32 *) ss->sge.vaddr)[w - 1]; ++ if (packet_data) { ++ memcpy(packet_data, ss->sge.vaddr, len); ++ packet_data += w; ++ } + break; + } else { + u32 w = len >> 2; + + qib_pio_copy(piobuf, ss->sge.vaddr, w); + piobuf += w; ++ if (packet_data) { ++ memcpy(packet_data, ss->sge.vaddr, len); ++ packet_data += w; ++ } + + extra = len & (sizeof(u32) - 1); + if (extra) { +@@ -1144,12 +1179,13 @@ static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, + u32 control; + u32 ndesc; + int ret; ++ struct snoop_packet *packet = NULL; + + tx = qp->s_tx; + if (tx) { + qp->s_tx = NULL; + /* resend previously constructed packet */ +- ret = qib_sdma_verbs_send(ppd, tx->ss, tx->dwords, tx); ++ ret = qib_sdma_verbs_send(ppd, tx->ss, tx->dwords, tx, NULL); + goto bail; + } + +@@ -1173,6 +1209,19 @@ static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, + if (plen + 1 > dd->piosize2kmax_dwords) + tx->txreq.flags |= QIB_SDMA_TXREQ_F_USELARGEBUF; + ++ if (ppd->mode_flag) { ++ int nomatch = 0; ++ if (ppd->filter_callback) ++ nomatch = ppd->filter_callback(hdr, NULL, ++ ppd->filter_value); ++ if (nomatch == 0) { ++ packet = kzalloc(sizeof(*packet)+QIB_GET_PKT_LEN(hdr), ++ GFP_ATOMIC); ++ if (packet) ++ packet->total_len = QIB_GET_PKT_LEN(hdr); ++ } ++ } ++ + if (len) { + /* + * Don't try to DMA if it takes more descriptors than +@@ -1193,7 +1242,9 @@ static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, + tx->txreq.addr = dev->pio_hdrs_phys + + tx->hdr_inx * sizeof(struct qib_pio_header); + tx->hdr_dwords = hdrwords + 2; /* add PBC length */ +- ret = qib_sdma_verbs_send(ppd, ss, dwords, tx); ++ if (packet) ++ memcpy(packet->data, hdr, (hdrwords << 2)); ++ ret = qib_sdma_verbs_send(ppd, ss, dwords, tx, packet); + goto bail; + } + +@@ -1206,6 +1257,12 @@ static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, + phdr->pbc[1] = cpu_to_le32(control); + memcpy(&phdr->hdr, hdr, hdrwords << 2); + qib_copy_from_sge((u32 *) &phdr->hdr + hdrwords, ss, len); ++ if (packet) { ++ memcpy(packet->data, &phdr->hdr, 
(hdrwords << 2)); ++ memcpy(packet->data+(hdrwords << 2), ++ (u8 *)((u32 *) &phdr->hdr + hdrwords), ++ len); ++ } + + tx->txreq.addr = dma_map_single(&dd->pcidev->dev, phdr, + tx->hdr_dwords << 2, DMA_TO_DEVICE); +@@ -1214,7 +1271,7 @@ static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, + tx->align_buf = phdr; + tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEBUF; + tx->txreq.sg_count = 1; +- ret = qib_sdma_verbs_send(ppd, NULL, 0, tx); ++ ret = qib_sdma_verbs_send(ppd, NULL, 0, tx, NULL); + goto unaligned; + + map_err: +@@ -1222,9 +1279,24 @@ map_err: + err_tx: + qib_put_txreq(tx); + ret = wait_kmem(dev, qp); ++ /* If wait_kmem returns 0 then ++ * (ret==0) will hold true and we don't want ++ * that as it will add ignored packet in list, ++ * so free packet here. ++ */ ++ kfree(packet); ++ packet = NULL; + unaligned: + ibp->n_unaligned++; + bail: ++ if (packet) { ++ if (ret == 0) ++ qib_snoop_send_queue_packet(ppd, packet); ++ else { ++ kfree(packet); ++ packet = NULL; ++ } ++ } + return ret; + bail_tx: + ret = PTR_ERR(tx); +@@ -1280,6 +1352,8 @@ static int qib_verbs_send_pio(struct qib_qp *qp, struct qib_ib_header *ibhdr, + unsigned flush_wc; + u32 control; + u32 pbufn; ++ u8 *data_orig = NULL; ++ struct snoop_packet *packet = NULL; + + control = dd->f_setpbc_control(ppd, plen, qp->s_srate, + be16_to_cpu(ibhdr->lrh[0]) >> 12); +@@ -1288,6 +1362,20 @@ static int qib_verbs_send_pio(struct qib_qp *qp, struct qib_ib_header *ibhdr, + if (unlikely(piobuf == NULL)) + return no_bufs_available(qp); + ++ if (snoop_enable && ppd->mode_flag) { ++ int nomatch = 0; ++ if (ppd->filter_callback) ++ nomatch = ppd->filter_callback(ibhdr, NULL, ++ ppd->filter_value); ++ if (nomatch == 0) { ++ packet = kzalloc(sizeof(*packet)+QIB_GET_PKT_LEN(ibhdr), ++ GFP_ATOMIC); ++ if (packet) { ++ INIT_LIST_HEAD(&packet->list); ++ packet->total_len = QIB_GET_PKT_LEN(ibhdr); ++ } ++ } ++ } + /* + * Write the pbc. + * We have to flush after the PBC for correctness on some cpus +@@ -1297,6 +1385,12 @@ static int qib_verbs_send_pio(struct qib_qp *qp, struct qib_ib_header *ibhdr, + piobuf_orig = piobuf; + piobuf += 2; + ++ if (packet) { ++ /* Copy header */ ++ data_orig = packet->data; ++ memcpy(data_orig, hdr, (hdrwords << 2)); ++ data_orig += (hdrwords << 2); ++ } + flush_wc = dd->flags & QIB_PIO_FLUSH_WC; + if (len == 0) { + /* +@@ -1336,10 +1430,19 @@ static int qib_verbs_send_pio(struct qib_qp *qp, struct qib_ib_header *ibhdr, + qib_flush_wc(); + } else + qib_pio_copy(piobuf, addr, dwords); ++ if (packet) { ++ /* Copy data */ ++ memcpy(data_orig, addr, len); ++ data_orig += len; ++ } + goto done; + } +- copy_io(piobuf, ss, len, flush_wc); ++ copy_io(piobuf, ss, len, flush_wc, packet, data_orig); + done: ++ if (packet) { ++ qib_snoop_send_queue_packet(ppd, packet); ++ packet = NULL; ++ } + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; + qib_flush_wc(); +@@ -1623,7 +1726,8 @@ static int qib_query_port(struct ib_device *ibdev, u8 port, + props->max_vl_num = qib_num_vls(ppd->vls_supported); + props->init_type_reply = 0; + +- props->max_mtu = qib_ibmtu ? qib_ibmtu : IB_MTU_4096; ++ props->max_mtu = QIB_MODPARAM_GET(ibmtu, dd->unit, ppd->port) ? ++ QIB_MODPARAM_GET(ibmtu, dd->unit, ppd->port) : IB_MTU_4096; + switch (ppd->ibmtu) { + case 4096: + mtu = IB_MTU_4096; +-- +1.7.1 + -- 2.41.0